diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..bd6120e534915dba6dbb4e7599b9746b60e19a86 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-114/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-342/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-456/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-684/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5c526ddfa14ccbc17e50d783309db19280521b8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,146 @@
+---
+library_name: peft
+license: other
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+tags:
+- generated_from_trainer
+datasets:
+- ugaoo/multimedqa_and_wrongonesllama
+model-index:
+- name: out/multimedqa_and_wrongonesllama
+ results: []
+---
+
+
+
+[
](https://github.com/axolotl-ai-cloud/axolotl)
+See axolotl config
+
+axolotl version: `0.8.0.dev0`
+```yaml
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: ugaoo/multimedqa_and_wrongonesllama
+ type: alpaca
+val_set_size: 0
+output_dir: ./out/multimedqa_and_wrongonesllama
+
+sequence_len: 4000
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 512
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+ - q_proj
+ - k_proj
+ - v_proj
+ - o_proj
+ - up_proj
+ - down_proj
+ - gate_proj
+lora_modules_to_save:
+ - embed_tokens
+ - lm_head
+
+wandb_project: cosmosearch
+wandb_entity:
+wandb_watch:
+wandb_name: multimedqa_and_wrongonesllama_Super-49B
+wandb_log_model:
+
+gradient_accumulation_steps: 3
+micro_batch_size: 4
+num_epochs: 6
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 5e-6
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 6
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+save_total_limit: 6
+special_tokens:
+ pad_token: <|end_of_text|>
+
+```
+
+
+
+# out/multimedqa_and_wrongonesllama
+
+This model is a fine-tuned version of [nvidia/Llama-3_3-Nemotron-Super-49B-v1](https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1) on the ugaoo/multimedqa_and_wrongonesllama dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- gradient_accumulation_steps: 3
+- total_train_batch_size: 24
+- total_eval_batch_size: 8
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 6.0
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.15.0
+- Transformers 4.49.0
+- Pytorch 2.5.1+cu124
+- Datasets 3.4.1
+- Tokenizers 0.21.1
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d071e8337a127c8780a346e6e69c4e2195786154
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c08cabaa331365104eda0f955b3bcca40f58f5ba2408e03aedf9cc235c104191
+size 9016826528
diff --git a/checkpoint-114/README.md b/checkpoint-114/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615
--- /dev/null
+++ b/checkpoint-114/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-114/adapter_config.json b/checkpoint-114/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/checkpoint-114/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-114/adapter_model.safetensors b/checkpoint-114/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f4239b78cce51457a023d3b245d6dd89bd6bbe36
--- /dev/null
+++ b/checkpoint-114/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c9b6e9543d7c2d41d3306ac6f0fe4cda7267eece06c8587fec3a68b8ba04243
+size 9016826528
diff --git a/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6a84d1cc265afdbab6917598c97c7b483552c253
--- /dev/null
+++ b/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eca3ffba8bfb5906d22a15bd9ccc52f42b5339056f7a5836afbe7c8a33cbfbb5
+size 27050164444
diff --git a/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d7c5a252a0149e81136e00af57a943b5571ecc98
--- /dev/null
+++ b/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8de04aa8407a0004079a71b924943aa7468b24af275b648a69a53fe0c20db5e
+size 27050169884
diff --git a/checkpoint-114/global_step114/mp_rank_00_model_states.pt b/checkpoint-114/global_step114/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7c9d96534454ef44d808e002132b2bc109f507cd
--- /dev/null
+++ b/checkpoint-114/global_step114/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0478eba317b4d5b9acff1bf96c6c89ae28a8b2e1080f0575c3b24bd186b155
+size 9776788601
diff --git a/checkpoint-114/latest b/checkpoint-114/latest
new file mode 100644
index 0000000000000000000000000000000000000000..aad80f76777fd4d23b0b81026f4601524335cbe1
--- /dev/null
+++ b/checkpoint-114/latest
@@ -0,0 +1 @@
+global_step114
\ No newline at end of file
diff --git a/checkpoint-114/rng_state_0.pth b/checkpoint-114/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3115ef5b3f240303888fd17b7517182de213d964
--- /dev/null
+++ b/checkpoint-114/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acededc55cf300dac4729a8ab7c731573a49bfe522164173f4aa200189894bf7
+size 14512
diff --git a/checkpoint-114/rng_state_1.pth b/checkpoint-114/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8677a911d6d783cf6a6dc5b8b13f6dd17eca4720
--- /dev/null
+++ b/checkpoint-114/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:174dbd9d3bdd2e47a45b6b645ec401c6d6b33e4bf885128debfda1d5649a747a
+size 14512
diff --git a/checkpoint-114/scheduler.pt b/checkpoint-114/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..40c17d3863167c8f9a6afd933a45e93fda7d96e4
--- /dev/null
+++ b/checkpoint-114/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ad006ca6fd06276c4f0d747b779fbbcfdff6edce744bcfd757e846b0536c240
+size 1064
diff --git a/checkpoint-114/special_tokens_map.json b/checkpoint-114/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-114/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-114/tokenizer.json b/checkpoint-114/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-114/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-114/tokenizer_config.json b/checkpoint-114/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/checkpoint-114/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}{%- set content = message['content'].split('</think>')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-114/trainer_state.json b/checkpoint-114/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c75ec46ce1ce29382194d526c7ccab18b22e1bc7
--- /dev/null
+++ b/checkpoint-114/trainer_state.json
@@ -0,0 +1,831 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 114,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.008771929824561403,
+ "grad_norm": 39.56407165527344,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 5.1375,
+ "step": 1
+ },
+ {
+ "epoch": 0.017543859649122806,
+ "grad_norm": 40.30452346801758,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 5.1185,
+ "step": 2
+ },
+ {
+ "epoch": 0.02631578947368421,
+ "grad_norm": 40.062313079833984,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 5.0762,
+ "step": 3
+ },
+ {
+ "epoch": 0.03508771929824561,
+ "grad_norm": 39.17148208618164,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 5.016,
+ "step": 4
+ },
+ {
+ "epoch": 0.043859649122807015,
+ "grad_norm": 40.67367172241211,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 5.0428,
+ "step": 5
+ },
+ {
+ "epoch": 0.05263157894736842,
+ "grad_norm": 38.18095016479492,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 5.2025,
+ "step": 6
+ },
+ {
+ "epoch": 0.06140350877192982,
+ "grad_norm": 39.12940979003906,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 4.9896,
+ "step": 7
+ },
+ {
+ "epoch": 0.07017543859649122,
+ "grad_norm": 38.84568405151367,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 5.1078,
+ "step": 8
+ },
+ {
+ "epoch": 0.07894736842105263,
+ "grad_norm": 39.38333511352539,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 5.0808,
+ "step": 9
+ },
+ {
+ "epoch": 0.08771929824561403,
+ "grad_norm": 39.427650451660156,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 5.0534,
+ "step": 10
+ },
+ {
+ "epoch": 0.09649122807017543,
+ "grad_norm": 39.29513168334961,
+ "learning_rate": 5.5e-07,
+ "loss": 5.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.10526315789473684,
+ "grad_norm": 39.641231536865234,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 5.0317,
+ "step": 12
+ },
+ {
+ "epoch": 0.11403508771929824,
+ "grad_norm": 37.91259765625,
+ "learning_rate": 6.5e-07,
+ "loss": 4.912,
+ "step": 13
+ },
+ {
+ "epoch": 0.12280701754385964,
+ "grad_norm": 38.203548431396484,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 4.9705,
+ "step": 14
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 39.15998840332031,
+ "learning_rate": 7.5e-07,
+ "loss": 4.6962,
+ "step": 15
+ },
+ {
+ "epoch": 0.14035087719298245,
+ "grad_norm": 37.754669189453125,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 4.6262,
+ "step": 16
+ },
+ {
+ "epoch": 0.14912280701754385,
+ "grad_norm": 35.871490478515625,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 4.5422,
+ "step": 17
+ },
+ {
+ "epoch": 0.15789473684210525,
+ "grad_norm": 36.16888427734375,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 4.664,
+ "step": 18
+ },
+ {
+ "epoch": 0.16666666666666666,
+ "grad_norm": 33.520118713378906,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 4.4697,
+ "step": 19
+ },
+ {
+ "epoch": 0.17543859649122806,
+ "grad_norm": 30.896282196044922,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 4.3568,
+ "step": 20
+ },
+ {
+ "epoch": 0.18421052631578946,
+ "grad_norm": 29.944643020629883,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 4.2269,
+ "step": 21
+ },
+ {
+ "epoch": 0.19298245614035087,
+ "grad_norm": 25.224485397338867,
+ "learning_rate": 1.1e-06,
+ "loss": 4.1272,
+ "step": 22
+ },
+ {
+ "epoch": 0.20175438596491227,
+ "grad_norm": 24.410480499267578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 4.0585,
+ "step": 23
+ },
+ {
+ "epoch": 0.21052631578947367,
+ "grad_norm": 21.480648040771484,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 3.9472,
+ "step": 24
+ },
+ {
+ "epoch": 0.21929824561403508,
+ "grad_norm": 20.61946678161621,
+ "learning_rate": 1.25e-06,
+ "loss": 3.8879,
+ "step": 25
+ },
+ {
+ "epoch": 0.22807017543859648,
+ "grad_norm": 19.578271865844727,
+ "learning_rate": 1.3e-06,
+ "loss": 3.6783,
+ "step": 26
+ },
+ {
+ "epoch": 0.23684210526315788,
+ "grad_norm": 17.418983459472656,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 3.6826,
+ "step": 27
+ },
+ {
+ "epoch": 0.24561403508771928,
+ "grad_norm": 18.160301208496094,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 3.478,
+ "step": 28
+ },
+ {
+ "epoch": 0.2543859649122807,
+ "grad_norm": 17.573204040527344,
+ "learning_rate": 1.45e-06,
+ "loss": 3.459,
+ "step": 29
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 17.1265869140625,
+ "learning_rate": 1.5e-06,
+ "loss": 3.3999,
+ "step": 30
+ },
+ {
+ "epoch": 0.2719298245614035,
+ "grad_norm": 15.527145385742188,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 3.2817,
+ "step": 31
+ },
+ {
+ "epoch": 0.2807017543859649,
+ "grad_norm": 14.773847579956055,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 3.234,
+ "step": 32
+ },
+ {
+ "epoch": 0.2894736842105263,
+ "grad_norm": 12.039301872253418,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 3.132,
+ "step": 33
+ },
+ {
+ "epoch": 0.2982456140350877,
+ "grad_norm": 9.217979431152344,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 3.0548,
+ "step": 34
+ },
+ {
+ "epoch": 0.30701754385964913,
+ "grad_norm": 7.575639724731445,
+ "learning_rate": 1.75e-06,
+ "loss": 2.9529,
+ "step": 35
+ },
+ {
+ "epoch": 0.3157894736842105,
+ "grad_norm": 7.496004104614258,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 2.8967,
+ "step": 36
+ },
+ {
+ "epoch": 0.32456140350877194,
+ "grad_norm": 7.45414924621582,
+ "learning_rate": 1.85e-06,
+ "loss": 2.8837,
+ "step": 37
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 8.555658340454102,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 2.7473,
+ "step": 38
+ },
+ {
+ "epoch": 0.34210526315789475,
+ "grad_norm": 10.03805160522461,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 2.7355,
+ "step": 39
+ },
+ {
+ "epoch": 0.3508771929824561,
+ "grad_norm": 9.30649471282959,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 2.6587,
+ "step": 40
+ },
+ {
+ "epoch": 0.35964912280701755,
+ "grad_norm": 8.510339736938477,
+ "learning_rate": 2.05e-06,
+ "loss": 2.5977,
+ "step": 41
+ },
+ {
+ "epoch": 0.3684210526315789,
+ "grad_norm": 4.709080696105957,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 2.6286,
+ "step": 42
+ },
+ {
+ "epoch": 0.37719298245614036,
+ "grad_norm": 5.128961086273193,
+ "learning_rate": 2.15e-06,
+ "loss": 2.4558,
+ "step": 43
+ },
+ {
+ "epoch": 0.38596491228070173,
+ "grad_norm": 5.190136432647705,
+ "learning_rate": 2.2e-06,
+ "loss": 2.4432,
+ "step": 44
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 4.893551349639893,
+ "learning_rate": 2.25e-06,
+ "loss": 2.4939,
+ "step": 45
+ },
+ {
+ "epoch": 0.40350877192982454,
+ "grad_norm": 5.2434983253479,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 2.3381,
+ "step": 46
+ },
+ {
+ "epoch": 0.41228070175438597,
+ "grad_norm": 5.122412204742432,
+ "learning_rate": 2.35e-06,
+ "loss": 2.313,
+ "step": 47
+ },
+ {
+ "epoch": 0.42105263157894735,
+ "grad_norm": 4.577274799346924,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 2.2236,
+ "step": 48
+ },
+ {
+ "epoch": 0.4298245614035088,
+ "grad_norm": 4.722769737243652,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 2.1987,
+ "step": 49
+ },
+ {
+ "epoch": 0.43859649122807015,
+ "grad_norm": 5.059235095977783,
+ "learning_rate": 2.5e-06,
+ "loss": 2.1415,
+ "step": 50
+ },
+ {
+ "epoch": 0.4473684210526316,
+ "grad_norm": 4.454439640045166,
+ "learning_rate": 2.55e-06,
+ "loss": 2.0466,
+ "step": 51
+ },
+ {
+ "epoch": 0.45614035087719296,
+ "grad_norm": 4.94586706161499,
+ "learning_rate": 2.6e-06,
+ "loss": 1.8762,
+ "step": 52
+ },
+ {
+ "epoch": 0.4649122807017544,
+ "grad_norm": 4.704402446746826,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.8012,
+ "step": 53
+ },
+ {
+ "epoch": 0.47368421052631576,
+ "grad_norm": 6.125903129577637,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 1.7669,
+ "step": 54
+ },
+ {
+ "epoch": 0.4824561403508772,
+ "grad_norm": 4.5356059074401855,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.6607,
+ "step": 55
+ },
+ {
+ "epoch": 0.49122807017543857,
+ "grad_norm": 6.56803035736084,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 1.6291,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 4.910050392150879,
+ "learning_rate": 2.85e-06,
+ "loss": 1.5545,
+ "step": 57
+ },
+ {
+ "epoch": 0.5087719298245614,
+ "grad_norm": 8.733433723449707,
+ "learning_rate": 2.9e-06,
+ "loss": 1.4206,
+ "step": 58
+ },
+ {
+ "epoch": 0.5175438596491229,
+ "grad_norm": 8.582486152648926,
+ "learning_rate": 2.95e-06,
+ "loss": 1.3912,
+ "step": 59
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 13.710689544677734,
+ "learning_rate": 3e-06,
+ "loss": 1.3297,
+ "step": 60
+ },
+ {
+ "epoch": 0.5350877192982456,
+ "grad_norm": 23.400312423706055,
+ "learning_rate": 3.05e-06,
+ "loss": 1.296,
+ "step": 61
+ },
+ {
+ "epoch": 0.543859649122807,
+ "grad_norm": 5.678805351257324,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.2259,
+ "step": 62
+ },
+ {
+ "epoch": 0.5526315789473685,
+ "grad_norm": 14.700899124145508,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 1.1087,
+ "step": 63
+ },
+ {
+ "epoch": 0.5614035087719298,
+ "grad_norm": 19.38919448852539,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 1.1805,
+ "step": 64
+ },
+ {
+ "epoch": 0.5701754385964912,
+ "grad_norm": 8.460039138793945,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 1.0963,
+ "step": 65
+ },
+ {
+ "epoch": 0.5789473684210527,
+ "grad_norm": 13.371014595031738,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 1.0627,
+ "step": 66
+ },
+ {
+ "epoch": 0.5877192982456141,
+ "grad_norm": 22.380569458007812,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0869,
+ "step": 67
+ },
+ {
+ "epoch": 0.5964912280701754,
+ "grad_norm": 5.780513286590576,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9991,
+ "step": 68
+ },
+ {
+ "epoch": 0.6052631578947368,
+ "grad_norm": 19.850841522216797,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9683,
+ "step": 69
+ },
+ {
+ "epoch": 0.6140350877192983,
+ "grad_norm": 17.160703659057617,
+ "learning_rate": 3.5e-06,
+ "loss": 0.845,
+ "step": 70
+ },
+ {
+ "epoch": 0.6228070175438597,
+ "grad_norm": 14.264311790466309,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8059,
+ "step": 71
+ },
+ {
+ "epoch": 0.631578947368421,
+ "grad_norm": 26.39459991455078,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.85,
+ "step": 72
+ },
+ {
+ "epoch": 0.6403508771929824,
+ "grad_norm": 51.10348892211914,
+ "learning_rate": 3.65e-06,
+ "loss": 0.9755,
+ "step": 73
+ },
+ {
+ "epoch": 0.6491228070175439,
+ "grad_norm": 28.795856475830078,
+ "learning_rate": 3.7e-06,
+ "loss": 0.8966,
+ "step": 74
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.6617937088012695,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7716,
+ "step": 75
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.729666709899902,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.7578,
+ "step": 76
+ },
+ {
+ "epoch": 0.6754385964912281,
+ "grad_norm": 7.109970569610596,
+ "learning_rate": 3.85e-06,
+ "loss": 0.7055,
+ "step": 77
+ },
+ {
+ "epoch": 0.6842105263157895,
+ "grad_norm": 20.84659194946289,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.7458,
+ "step": 78
+ },
+ {
+ "epoch": 0.6929824561403509,
+ "grad_norm": 21.601303100585938,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6879,
+ "step": 79
+ },
+ {
+ "epoch": 0.7017543859649122,
+ "grad_norm": 3.6914751529693604,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6179,
+ "step": 80
+ },
+ {
+ "epoch": 0.7105263157894737,
+ "grad_norm": 16.539325714111328,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5716,
+ "step": 81
+ },
+ {
+ "epoch": 0.7192982456140351,
+ "grad_norm": 13.931925773620605,
+ "learning_rate": 4.1e-06,
+ "loss": 0.558,
+ "step": 82
+ },
+ {
+ "epoch": 0.7280701754385965,
+ "grad_norm": 10.52951717376709,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6018,
+ "step": 83
+ },
+ {
+ "epoch": 0.7368421052631579,
+ "grad_norm": 17.337060928344727,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5501,
+ "step": 84
+ },
+ {
+ "epoch": 0.7456140350877193,
+ "grad_norm": 13.500468254089355,
+ "learning_rate": 4.25e-06,
+ "loss": 0.5214,
+ "step": 85
+ },
+ {
+ "epoch": 0.7543859649122807,
+ "grad_norm": 10.290645599365234,
+ "learning_rate": 4.3e-06,
+ "loss": 0.4996,
+ "step": 86
+ },
+ {
+ "epoch": 0.7631578947368421,
+ "grad_norm": 9.757556915283203,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.498,
+ "step": 87
+ },
+ {
+ "epoch": 0.7719298245614035,
+ "grad_norm": 9.325140953063965,
+ "learning_rate": 4.4e-06,
+ "loss": 0.4721,
+ "step": 88
+ },
+ {
+ "epoch": 0.7807017543859649,
+ "grad_norm": 2.9322128295898438,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.4528,
+ "step": 89
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 10.484073638916016,
+ "learning_rate": 4.5e-06,
+ "loss": 0.445,
+ "step": 90
+ },
+ {
+ "epoch": 0.7982456140350878,
+ "grad_norm": 32.7827262878418,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5105,
+ "step": 91
+ },
+ {
+ "epoch": 0.8070175438596491,
+ "grad_norm": 2.8477306365966797,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.4117,
+ "step": 92
+ },
+ {
+ "epoch": 0.8157894736842105,
+ "grad_norm": 2.7680225372314453,
+ "learning_rate": 4.65e-06,
+ "loss": 0.3653,
+ "step": 93
+ },
+ {
+ "epoch": 0.8245614035087719,
+ "grad_norm": 2.6512742042541504,
+ "learning_rate": 4.7e-06,
+ "loss": 0.3878,
+ "step": 94
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 6.453914165496826,
+ "learning_rate": 4.75e-06,
+ "loss": 0.3611,
+ "step": 95
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.4594080448150635,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.3817,
+ "step": 96
+ },
+ {
+ "epoch": 0.8508771929824561,
+ "grad_norm": 3.6144917011260986,
+ "learning_rate": 4.85e-06,
+ "loss": 0.3618,
+ "step": 97
+ },
+ {
+ "epoch": 0.8596491228070176,
+ "grad_norm": 5.349407196044922,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.3218,
+ "step": 98
+ },
+ {
+ "epoch": 0.868421052631579,
+ "grad_norm": 13.671236991882324,
+ "learning_rate": 4.95e-06,
+ "loss": 0.3329,
+ "step": 99
+ },
+ {
+ "epoch": 0.8771929824561403,
+ "grad_norm": 5.84046745300293,
+ "learning_rate": 5e-06,
+ "loss": 0.2967,
+ "step": 100
+ },
+ {
+ "epoch": 0.8859649122807017,
+ "grad_norm": 14.005338668823242,
+ "learning_rate": 4.999963827125897e-06,
+ "loss": 0.303,
+ "step": 101
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 9.18114185333252,
+ "learning_rate": 4.999855309550366e-06,
+ "loss": 0.2762,
+ "step": 102
+ },
+ {
+ "epoch": 0.9035087719298246,
+ "grad_norm": 3.0800487995147705,
+ "learning_rate": 4.999674450413725e-06,
+ "loss": 0.2628,
+ "step": 103
+ },
+ {
+ "epoch": 0.9122807017543859,
+ "grad_norm": 82.03578186035156,
+ "learning_rate": 4.999421254949728e-06,
+ "loss": 0.4065,
+ "step": 104
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 77.66315460205078,
+ "learning_rate": 4.99909573048542e-06,
+ "loss": 0.4307,
+ "step": 105
+ },
+ {
+ "epoch": 0.9298245614035088,
+ "grad_norm": 18.28767967224121,
+ "learning_rate": 4.998697886440927e-06,
+ "loss": 0.2571,
+ "step": 106
+ },
+ {
+ "epoch": 0.9385964912280702,
+ "grad_norm": 5.960445880889893,
+ "learning_rate": 4.998227734329177e-06,
+ "loss": 0.2847,
+ "step": 107
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 5.437699794769287,
+ "learning_rate": 4.9976852877555755e-06,
+ "loss": 0.2728,
+ "step": 108
+ },
+ {
+ "epoch": 0.956140350877193,
+ "grad_norm": 3.379631280899048,
+ "learning_rate": 4.997070562417602e-06,
+ "loss": 0.2467,
+ "step": 109
+ },
+ {
+ "epoch": 0.9649122807017544,
+ "grad_norm": 3.1625075340270996,
+ "learning_rate": 4.996383576104362e-06,
+ "loss": 0.2273,
+ "step": 110
+ },
+ {
+ "epoch": 0.9736842105263158,
+ "grad_norm": 15.588600158691406,
+ "learning_rate": 4.995624348696071e-06,
+ "loss": 0.2486,
+ "step": 111
+ },
+ {
+ "epoch": 0.9824561403508771,
+ "grad_norm": 2.631044387817383,
+ "learning_rate": 4.9947929021634815e-06,
+ "loss": 0.1964,
+ "step": 112
+ },
+ {
+ "epoch": 0.9912280701754386,
+ "grad_norm": 4.706504821777344,
+ "learning_rate": 4.993889260567239e-06,
+ "loss": 0.1901,
+ "step": 113
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 10.368465423583984,
+ "learning_rate": 4.9929134500571954e-06,
+ "loss": 0.1996,
+ "step": 114
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 684,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 114,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.45999007414852e+18,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-114/training_args.bin b/checkpoint-114/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf
--- /dev/null
+++ b/checkpoint-114/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae
+size 7992
diff --git a/checkpoint-114/zero_to_fp32.py b/checkpoint-114/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-114/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-228/README.md b/checkpoint-228/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615
--- /dev/null
+++ b/checkpoint-228/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-228/adapter_config.json b/checkpoint-228/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/checkpoint-228/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-228/adapter_model.safetensors b/checkpoint-228/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a68b30f953577754becff0c56a3018c6e48f3d1b
--- /dev/null
+++ b/checkpoint-228/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7595d2ffb7b408a9a2b9933fb1eb962a9e37c2d3c114ce50e86160ff0a1720a2
+size 9016826528
diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..417b082643685d5aac1e14bacc4b52c0adfa670d
--- /dev/null
+++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:613a22985902c9d1d1be9c9ce2dab87a331de51fd89da9348dd5f43ec07cd409
+size 27050164444
diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..15369f5d428ce41de2b0b4778e7740d3df195dc6
--- /dev/null
+++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5620d8ab8d21b3d10f4b88326c726459339e432ae6fc1dc2205c16dc137fce0e
+size 27050169884
diff --git a/checkpoint-228/global_step228/mp_rank_00_model_states.pt b/checkpoint-228/global_step228/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b636fabd7086a0a4fc88eefd869ed6fbdf83123d
--- /dev/null
+++ b/checkpoint-228/global_step228/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24fb10124c58ab3bc95ccb3b9dcc7a5bdeefd22a99b84346cf3c1df9f76c8c27
+size 9776788601
diff --git a/checkpoint-228/latest b/checkpoint-228/latest
new file mode 100644
index 0000000000000000000000000000000000000000..74f667dd5aec7b1dcf458da255b4d04f2e864037
--- /dev/null
+++ b/checkpoint-228/latest
@@ -0,0 +1 @@
+global_step228
\ No newline at end of file
diff --git a/checkpoint-228/rng_state_0.pth b/checkpoint-228/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a5d14df7a6086589916370411c87ca4b0ff67991
--- /dev/null
+++ b/checkpoint-228/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df88b39032cfb5865c667e31cb370a479cdab725990452a7f491c7100c7266f
+size 14512
diff --git a/checkpoint-228/rng_state_1.pth b/checkpoint-228/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b56566661121ee55636a0083720baa794abae012
--- /dev/null
+++ b/checkpoint-228/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fccaad5cbd19ebb15866094c25b042ca7260a9e174b4a8e2a720bae96eb35fe
+size 14512
diff --git a/checkpoint-228/scheduler.pt b/checkpoint-228/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bb6c4bace72a7b1c8de145936466d2b1e4a21463
--- /dev/null
+++ b/checkpoint-228/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff6a932650a4637e48ce03bf2825ccc9a1ed4f05bb0a73538a68ddc440b889a8
+size 1064
diff --git a/checkpoint-228/special_tokens_map.json b/checkpoint-228/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-228/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-228/tokenizer.json b/checkpoint-228/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-228/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-228/tokenizer_config.json b/checkpoint-228/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/checkpoint-228/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-228/trainer_state.json b/checkpoint-228/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4b145afc6ada1f7d4c584b2465ea96787544a67
--- /dev/null
+++ b/checkpoint-228/trainer_state.json
@@ -0,0 +1,1629 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 228,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.008771929824561403,
+ "grad_norm": 39.56407165527344,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 5.1375,
+ "step": 1
+ },
+ {
+ "epoch": 0.017543859649122806,
+ "grad_norm": 40.30452346801758,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 5.1185,
+ "step": 2
+ },
+ {
+ "epoch": 0.02631578947368421,
+ "grad_norm": 40.062313079833984,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 5.0762,
+ "step": 3
+ },
+ {
+ "epoch": 0.03508771929824561,
+ "grad_norm": 39.17148208618164,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 5.016,
+ "step": 4
+ },
+ {
+ "epoch": 0.043859649122807015,
+ "grad_norm": 40.67367172241211,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 5.0428,
+ "step": 5
+ },
+ {
+ "epoch": 0.05263157894736842,
+ "grad_norm": 38.18095016479492,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 5.2025,
+ "step": 6
+ },
+ {
+ "epoch": 0.06140350877192982,
+ "grad_norm": 39.12940979003906,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 4.9896,
+ "step": 7
+ },
+ {
+ "epoch": 0.07017543859649122,
+ "grad_norm": 38.84568405151367,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 5.1078,
+ "step": 8
+ },
+ {
+ "epoch": 0.07894736842105263,
+ "grad_norm": 39.38333511352539,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 5.0808,
+ "step": 9
+ },
+ {
+ "epoch": 0.08771929824561403,
+ "grad_norm": 39.427650451660156,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 5.0534,
+ "step": 10
+ },
+ {
+ "epoch": 0.09649122807017543,
+ "grad_norm": 39.29513168334961,
+ "learning_rate": 5.5e-07,
+ "loss": 5.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.10526315789473684,
+ "grad_norm": 39.641231536865234,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 5.0317,
+ "step": 12
+ },
+ {
+ "epoch": 0.11403508771929824,
+ "grad_norm": 37.91259765625,
+ "learning_rate": 6.5e-07,
+ "loss": 4.912,
+ "step": 13
+ },
+ {
+ "epoch": 0.12280701754385964,
+ "grad_norm": 38.203548431396484,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 4.9705,
+ "step": 14
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 39.15998840332031,
+ "learning_rate": 7.5e-07,
+ "loss": 4.6962,
+ "step": 15
+ },
+ {
+ "epoch": 0.14035087719298245,
+ "grad_norm": 37.754669189453125,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 4.6262,
+ "step": 16
+ },
+ {
+ "epoch": 0.14912280701754385,
+ "grad_norm": 35.871490478515625,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 4.5422,
+ "step": 17
+ },
+ {
+ "epoch": 0.15789473684210525,
+ "grad_norm": 36.16888427734375,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 4.664,
+ "step": 18
+ },
+ {
+ "epoch": 0.16666666666666666,
+ "grad_norm": 33.520118713378906,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 4.4697,
+ "step": 19
+ },
+ {
+ "epoch": 0.17543859649122806,
+ "grad_norm": 30.896282196044922,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 4.3568,
+ "step": 20
+ },
+ {
+ "epoch": 0.18421052631578946,
+ "grad_norm": 29.944643020629883,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 4.2269,
+ "step": 21
+ },
+ {
+ "epoch": 0.19298245614035087,
+ "grad_norm": 25.224485397338867,
+ "learning_rate": 1.1e-06,
+ "loss": 4.1272,
+ "step": 22
+ },
+ {
+ "epoch": 0.20175438596491227,
+ "grad_norm": 24.410480499267578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 4.0585,
+ "step": 23
+ },
+ {
+ "epoch": 0.21052631578947367,
+ "grad_norm": 21.480648040771484,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 3.9472,
+ "step": 24
+ },
+ {
+ "epoch": 0.21929824561403508,
+ "grad_norm": 20.61946678161621,
+ "learning_rate": 1.25e-06,
+ "loss": 3.8879,
+ "step": 25
+ },
+ {
+ "epoch": 0.22807017543859648,
+ "grad_norm": 19.578271865844727,
+ "learning_rate": 1.3e-06,
+ "loss": 3.6783,
+ "step": 26
+ },
+ {
+ "epoch": 0.23684210526315788,
+ "grad_norm": 17.418983459472656,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 3.6826,
+ "step": 27
+ },
+ {
+ "epoch": 0.24561403508771928,
+ "grad_norm": 18.160301208496094,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 3.478,
+ "step": 28
+ },
+ {
+ "epoch": 0.2543859649122807,
+ "grad_norm": 17.573204040527344,
+ "learning_rate": 1.45e-06,
+ "loss": 3.459,
+ "step": 29
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 17.1265869140625,
+ "learning_rate": 1.5e-06,
+ "loss": 3.3999,
+ "step": 30
+ },
+ {
+ "epoch": 0.2719298245614035,
+ "grad_norm": 15.527145385742188,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 3.2817,
+ "step": 31
+ },
+ {
+ "epoch": 0.2807017543859649,
+ "grad_norm": 14.773847579956055,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 3.234,
+ "step": 32
+ },
+ {
+ "epoch": 0.2894736842105263,
+ "grad_norm": 12.039301872253418,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 3.132,
+ "step": 33
+ },
+ {
+ "epoch": 0.2982456140350877,
+ "grad_norm": 9.217979431152344,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 3.0548,
+ "step": 34
+ },
+ {
+ "epoch": 0.30701754385964913,
+ "grad_norm": 7.575639724731445,
+ "learning_rate": 1.75e-06,
+ "loss": 2.9529,
+ "step": 35
+ },
+ {
+ "epoch": 0.3157894736842105,
+ "grad_norm": 7.496004104614258,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 2.8967,
+ "step": 36
+ },
+ {
+ "epoch": 0.32456140350877194,
+ "grad_norm": 7.45414924621582,
+ "learning_rate": 1.85e-06,
+ "loss": 2.8837,
+ "step": 37
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 8.555658340454102,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 2.7473,
+ "step": 38
+ },
+ {
+ "epoch": 0.34210526315789475,
+ "grad_norm": 10.03805160522461,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 2.7355,
+ "step": 39
+ },
+ {
+ "epoch": 0.3508771929824561,
+ "grad_norm": 9.30649471282959,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 2.6587,
+ "step": 40
+ },
+ {
+ "epoch": 0.35964912280701755,
+ "grad_norm": 8.510339736938477,
+ "learning_rate": 2.05e-06,
+ "loss": 2.5977,
+ "step": 41
+ },
+ {
+ "epoch": 0.3684210526315789,
+ "grad_norm": 4.709080696105957,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 2.6286,
+ "step": 42
+ },
+ {
+ "epoch": 0.37719298245614036,
+ "grad_norm": 5.128961086273193,
+ "learning_rate": 2.15e-06,
+ "loss": 2.4558,
+ "step": 43
+ },
+ {
+ "epoch": 0.38596491228070173,
+ "grad_norm": 5.190136432647705,
+ "learning_rate": 2.2e-06,
+ "loss": 2.4432,
+ "step": 44
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 4.893551349639893,
+ "learning_rate": 2.25e-06,
+ "loss": 2.4939,
+ "step": 45
+ },
+ {
+ "epoch": 0.40350877192982454,
+ "grad_norm": 5.2434983253479,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 2.3381,
+ "step": 46
+ },
+ {
+ "epoch": 0.41228070175438597,
+ "grad_norm": 5.122412204742432,
+ "learning_rate": 2.35e-06,
+ "loss": 2.313,
+ "step": 47
+ },
+ {
+ "epoch": 0.42105263157894735,
+ "grad_norm": 4.577274799346924,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 2.2236,
+ "step": 48
+ },
+ {
+ "epoch": 0.4298245614035088,
+ "grad_norm": 4.722769737243652,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 2.1987,
+ "step": 49
+ },
+ {
+ "epoch": 0.43859649122807015,
+ "grad_norm": 5.059235095977783,
+ "learning_rate": 2.5e-06,
+ "loss": 2.1415,
+ "step": 50
+ },
+ {
+ "epoch": 0.4473684210526316,
+ "grad_norm": 4.454439640045166,
+ "learning_rate": 2.55e-06,
+ "loss": 2.0466,
+ "step": 51
+ },
+ {
+ "epoch": 0.45614035087719296,
+ "grad_norm": 4.94586706161499,
+ "learning_rate": 2.6e-06,
+ "loss": 1.8762,
+ "step": 52
+ },
+ {
+ "epoch": 0.4649122807017544,
+ "grad_norm": 4.704402446746826,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.8012,
+ "step": 53
+ },
+ {
+ "epoch": 0.47368421052631576,
+ "grad_norm": 6.125903129577637,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 1.7669,
+ "step": 54
+ },
+ {
+ "epoch": 0.4824561403508772,
+ "grad_norm": 4.5356059074401855,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.6607,
+ "step": 55
+ },
+ {
+ "epoch": 0.49122807017543857,
+ "grad_norm": 6.56803035736084,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 1.6291,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 4.910050392150879,
+ "learning_rate": 2.85e-06,
+ "loss": 1.5545,
+ "step": 57
+ },
+ {
+ "epoch": 0.5087719298245614,
+ "grad_norm": 8.733433723449707,
+ "learning_rate": 2.9e-06,
+ "loss": 1.4206,
+ "step": 58
+ },
+ {
+ "epoch": 0.5175438596491229,
+ "grad_norm": 8.582486152648926,
+ "learning_rate": 2.95e-06,
+ "loss": 1.3912,
+ "step": 59
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 13.710689544677734,
+ "learning_rate": 3e-06,
+ "loss": 1.3297,
+ "step": 60
+ },
+ {
+ "epoch": 0.5350877192982456,
+ "grad_norm": 23.400312423706055,
+ "learning_rate": 3.05e-06,
+ "loss": 1.296,
+ "step": 61
+ },
+ {
+ "epoch": 0.543859649122807,
+ "grad_norm": 5.678805351257324,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.2259,
+ "step": 62
+ },
+ {
+ "epoch": 0.5526315789473685,
+ "grad_norm": 14.700899124145508,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 1.1087,
+ "step": 63
+ },
+ {
+ "epoch": 0.5614035087719298,
+ "grad_norm": 19.38919448852539,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 1.1805,
+ "step": 64
+ },
+ {
+ "epoch": 0.5701754385964912,
+ "grad_norm": 8.460039138793945,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 1.0963,
+ "step": 65
+ },
+ {
+ "epoch": 0.5789473684210527,
+ "grad_norm": 13.371014595031738,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 1.0627,
+ "step": 66
+ },
+ {
+ "epoch": 0.5877192982456141,
+ "grad_norm": 22.380569458007812,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0869,
+ "step": 67
+ },
+ {
+ "epoch": 0.5964912280701754,
+ "grad_norm": 5.780513286590576,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9991,
+ "step": 68
+ },
+ {
+ "epoch": 0.6052631578947368,
+ "grad_norm": 19.850841522216797,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9683,
+ "step": 69
+ },
+ {
+ "epoch": 0.6140350877192983,
+ "grad_norm": 17.160703659057617,
+ "learning_rate": 3.5e-06,
+ "loss": 0.845,
+ "step": 70
+ },
+ {
+ "epoch": 0.6228070175438597,
+ "grad_norm": 14.264311790466309,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8059,
+ "step": 71
+ },
+ {
+ "epoch": 0.631578947368421,
+ "grad_norm": 26.39459991455078,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.85,
+ "step": 72
+ },
+ {
+ "epoch": 0.6403508771929824,
+ "grad_norm": 51.10348892211914,
+ "learning_rate": 3.65e-06,
+ "loss": 0.9755,
+ "step": 73
+ },
+ {
+ "epoch": 0.6491228070175439,
+ "grad_norm": 28.795856475830078,
+ "learning_rate": 3.7e-06,
+ "loss": 0.8966,
+ "step": 74
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.6617937088012695,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7716,
+ "step": 75
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.729666709899902,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.7578,
+ "step": 76
+ },
+ {
+ "epoch": 0.6754385964912281,
+ "grad_norm": 7.109970569610596,
+ "learning_rate": 3.85e-06,
+ "loss": 0.7055,
+ "step": 77
+ },
+ {
+ "epoch": 0.6842105263157895,
+ "grad_norm": 20.84659194946289,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.7458,
+ "step": 78
+ },
+ {
+ "epoch": 0.6929824561403509,
+ "grad_norm": 21.601303100585938,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6879,
+ "step": 79
+ },
+ {
+ "epoch": 0.7017543859649122,
+ "grad_norm": 3.6914751529693604,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6179,
+ "step": 80
+ },
+ {
+ "epoch": 0.7105263157894737,
+ "grad_norm": 16.539325714111328,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5716,
+ "step": 81
+ },
+ {
+ "epoch": 0.7192982456140351,
+ "grad_norm": 13.931925773620605,
+ "learning_rate": 4.1e-06,
+ "loss": 0.558,
+ "step": 82
+ },
+ {
+ "epoch": 0.7280701754385965,
+ "grad_norm": 10.52951717376709,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6018,
+ "step": 83
+ },
+ {
+ "epoch": 0.7368421052631579,
+ "grad_norm": 17.337060928344727,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5501,
+ "step": 84
+ },
+ {
+ "epoch": 0.7456140350877193,
+ "grad_norm": 13.500468254089355,
+ "learning_rate": 4.25e-06,
+ "loss": 0.5214,
+ "step": 85
+ },
+ {
+ "epoch": 0.7543859649122807,
+ "grad_norm": 10.290645599365234,
+ "learning_rate": 4.3e-06,
+ "loss": 0.4996,
+ "step": 86
+ },
+ {
+ "epoch": 0.7631578947368421,
+ "grad_norm": 9.757556915283203,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.498,
+ "step": 87
+ },
+ {
+ "epoch": 0.7719298245614035,
+ "grad_norm": 9.325140953063965,
+ "learning_rate": 4.4e-06,
+ "loss": 0.4721,
+ "step": 88
+ },
+ {
+ "epoch": 0.7807017543859649,
+ "grad_norm": 2.9322128295898438,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.4528,
+ "step": 89
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 10.484073638916016,
+ "learning_rate": 4.5e-06,
+ "loss": 0.445,
+ "step": 90
+ },
+ {
+ "epoch": 0.7982456140350878,
+ "grad_norm": 32.7827262878418,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5105,
+ "step": 91
+ },
+ {
+ "epoch": 0.8070175438596491,
+ "grad_norm": 2.8477306365966797,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.4117,
+ "step": 92
+ },
+ {
+ "epoch": 0.8157894736842105,
+ "grad_norm": 2.7680225372314453,
+ "learning_rate": 4.65e-06,
+ "loss": 0.3653,
+ "step": 93
+ },
+ {
+ "epoch": 0.8245614035087719,
+ "grad_norm": 2.6512742042541504,
+ "learning_rate": 4.7e-06,
+ "loss": 0.3878,
+ "step": 94
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 6.453914165496826,
+ "learning_rate": 4.75e-06,
+ "loss": 0.3611,
+ "step": 95
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.4594080448150635,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.3817,
+ "step": 96
+ },
+ {
+ "epoch": 0.8508771929824561,
+ "grad_norm": 3.6144917011260986,
+ "learning_rate": 4.85e-06,
+ "loss": 0.3618,
+ "step": 97
+ },
+ {
+ "epoch": 0.8596491228070176,
+ "grad_norm": 5.349407196044922,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.3218,
+ "step": 98
+ },
+ {
+ "epoch": 0.868421052631579,
+ "grad_norm": 13.671236991882324,
+ "learning_rate": 4.95e-06,
+ "loss": 0.3329,
+ "step": 99
+ },
+ {
+ "epoch": 0.8771929824561403,
+ "grad_norm": 5.84046745300293,
+ "learning_rate": 5e-06,
+ "loss": 0.2967,
+ "step": 100
+ },
+ {
+ "epoch": 0.8859649122807017,
+ "grad_norm": 14.005338668823242,
+ "learning_rate": 4.999963827125897e-06,
+ "loss": 0.303,
+ "step": 101
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 9.18114185333252,
+ "learning_rate": 4.999855309550366e-06,
+ "loss": 0.2762,
+ "step": 102
+ },
+ {
+ "epoch": 0.9035087719298246,
+ "grad_norm": 3.0800487995147705,
+ "learning_rate": 4.999674450413725e-06,
+ "loss": 0.2628,
+ "step": 103
+ },
+ {
+ "epoch": 0.9122807017543859,
+ "grad_norm": 82.03578186035156,
+ "learning_rate": 4.999421254949728e-06,
+ "loss": 0.4065,
+ "step": 104
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 77.66315460205078,
+ "learning_rate": 4.99909573048542e-06,
+ "loss": 0.4307,
+ "step": 105
+ },
+ {
+ "epoch": 0.9298245614035088,
+ "grad_norm": 18.28767967224121,
+ "learning_rate": 4.998697886440927e-06,
+ "loss": 0.2571,
+ "step": 106
+ },
+ {
+ "epoch": 0.9385964912280702,
+ "grad_norm": 5.960445880889893,
+ "learning_rate": 4.998227734329177e-06,
+ "loss": 0.2847,
+ "step": 107
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 5.437699794769287,
+ "learning_rate": 4.9976852877555755e-06,
+ "loss": 0.2728,
+ "step": 108
+ },
+ {
+ "epoch": 0.956140350877193,
+ "grad_norm": 3.379631280899048,
+ "learning_rate": 4.997070562417602e-06,
+ "loss": 0.2467,
+ "step": 109
+ },
+ {
+ "epoch": 0.9649122807017544,
+ "grad_norm": 3.1625075340270996,
+ "learning_rate": 4.996383576104362e-06,
+ "loss": 0.2273,
+ "step": 110
+ },
+ {
+ "epoch": 0.9736842105263158,
+ "grad_norm": 15.588600158691406,
+ "learning_rate": 4.995624348696071e-06,
+ "loss": 0.2486,
+ "step": 111
+ },
+ {
+ "epoch": 0.9824561403508771,
+ "grad_norm": 2.631044387817383,
+ "learning_rate": 4.9947929021634815e-06,
+ "loss": 0.1964,
+ "step": 112
+ },
+ {
+ "epoch": 0.9912280701754386,
+ "grad_norm": 4.706504821777344,
+ "learning_rate": 4.993889260567239e-06,
+ "loss": 0.1901,
+ "step": 113
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 10.368465423583984,
+ "learning_rate": 4.9929134500571954e-06,
+ "loss": 0.1996,
+ "step": 114
+ },
+ {
+ "epoch": 1.0087719298245614,
+ "grad_norm": 30.44986343383789,
+ "learning_rate": 4.991865498871647e-06,
+ "loss": 0.2606,
+ "step": 115
+ },
+ {
+ "epoch": 1.0175438596491229,
+ "grad_norm": 14.421515464782715,
+ "learning_rate": 4.99074543733652e-06,
+ "loss": 0.2394,
+ "step": 116
+ },
+ {
+ "epoch": 1.0263157894736843,
+ "grad_norm": 14.072005271911621,
+ "learning_rate": 4.989553297864489e-06,
+ "loss": 0.2288,
+ "step": 117
+ },
+ {
+ "epoch": 1.0350877192982457,
+ "grad_norm": 4.395325660705566,
+ "learning_rate": 4.988289114954045e-06,
+ "loss": 0.2129,
+ "step": 118
+ },
+ {
+ "epoch": 1.043859649122807,
+ "grad_norm": 7.286703586578369,
+ "learning_rate": 4.986952925188489e-06,
+ "loss": 0.186,
+ "step": 119
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 8.332784652709961,
+ "learning_rate": 4.98554476723488e-06,
+ "loss": 0.178,
+ "step": 120
+ },
+ {
+ "epoch": 1.0614035087719298,
+ "grad_norm": 1.3646447658538818,
+ "learning_rate": 4.984064681842917e-06,
+ "loss": 0.1687,
+ "step": 121
+ },
+ {
+ "epoch": 1.0701754385964912,
+ "grad_norm": 4.494940757751465,
+ "learning_rate": 4.982512711843753e-06,
+ "loss": 0.1881,
+ "step": 122
+ },
+ {
+ "epoch": 1.0789473684210527,
+ "grad_norm": 3.3929836750030518,
+ "learning_rate": 4.980888902148757e-06,
+ "loss": 0.1764,
+ "step": 123
+ },
+ {
+ "epoch": 1.087719298245614,
+ "grad_norm": 1.8281155824661255,
+ "learning_rate": 4.979193299748225e-06,
+ "loss": 0.1602,
+ "step": 124
+ },
+ {
+ "epoch": 1.0964912280701755,
+ "grad_norm": 3.494239568710327,
+ "learning_rate": 4.977425953710005e-06,
+ "loss": 0.1729,
+ "step": 125
+ },
+ {
+ "epoch": 1.1052631578947367,
+ "grad_norm": 1.500410556793213,
+ "learning_rate": 4.975586915178084e-06,
+ "loss": 0.1666,
+ "step": 126
+ },
+ {
+ "epoch": 1.1140350877192982,
+ "grad_norm": 1.4680222272872925,
+ "learning_rate": 4.973676237371111e-06,
+ "loss": 0.159,
+ "step": 127
+ },
+ {
+ "epoch": 1.1228070175438596,
+ "grad_norm": 3.0383460521698,
+ "learning_rate": 4.971693975580851e-06,
+ "loss": 0.1484,
+ "step": 128
+ },
+ {
+ "epoch": 1.131578947368421,
+ "grad_norm": 3.74821138381958,
+ "learning_rate": 4.969640187170591e-06,
+ "loss": 0.1586,
+ "step": 129
+ },
+ {
+ "epoch": 1.1403508771929824,
+ "grad_norm": 4.682602405548096,
+ "learning_rate": 4.967514931573473e-06,
+ "loss": 0.1619,
+ "step": 130
+ },
+ {
+ "epoch": 1.1491228070175439,
+ "grad_norm": 3.90673565864563,
+ "learning_rate": 4.965318270290779e-06,
+ "loss": 0.164,
+ "step": 131
+ },
+ {
+ "epoch": 1.1578947368421053,
+ "grad_norm": 2.2017388343811035,
+ "learning_rate": 4.963050266890152e-06,
+ "loss": 0.1499,
+ "step": 132
+ },
+ {
+ "epoch": 1.1666666666666667,
+ "grad_norm": 2.4211816787719727,
+ "learning_rate": 4.960710987003753e-06,
+ "loss": 0.1387,
+ "step": 133
+ },
+ {
+ "epoch": 1.1754385964912282,
+ "grad_norm": 1.7753759622573853,
+ "learning_rate": 4.958300498326363e-06,
+ "loss": 0.1441,
+ "step": 134
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 1.5529910326004028,
+ "learning_rate": 4.955818870613425e-06,
+ "loss": 0.1304,
+ "step": 135
+ },
+ {
+ "epoch": 1.1929824561403508,
+ "grad_norm": 2.090593099594116,
+ "learning_rate": 4.953266175679023e-06,
+ "loss": 0.1419,
+ "step": 136
+ },
+ {
+ "epoch": 1.2017543859649122,
+ "grad_norm": 2.7141878604888916,
+ "learning_rate": 4.95064248739381e-06,
+ "loss": 0.1444,
+ "step": 137
+ },
+ {
+ "epoch": 1.2105263157894737,
+ "grad_norm": 2.3690481185913086,
+ "learning_rate": 4.947947881682861e-06,
+ "loss": 0.1383,
+ "step": 138
+ },
+ {
+ "epoch": 1.219298245614035,
+ "grad_norm": 2.2403147220611572,
+ "learning_rate": 4.945182436523482e-06,
+ "loss": 0.1418,
+ "step": 139
+ },
+ {
+ "epoch": 1.2280701754385965,
+ "grad_norm": 1.3939160108566284,
+ "learning_rate": 4.942346231942955e-06,
+ "loss": 0.1307,
+ "step": 140
+ },
+ {
+ "epoch": 1.236842105263158,
+ "grad_norm": 11.276732444763184,
+ "learning_rate": 4.939439350016214e-06,
+ "loss": 0.1397,
+ "step": 141
+ },
+ {
+ "epoch": 1.2456140350877192,
+ "grad_norm": 8.260516166687012,
+ "learning_rate": 4.9364618748634794e-06,
+ "loss": 0.1426,
+ "step": 142
+ },
+ {
+ "epoch": 1.2543859649122808,
+ "grad_norm": 2.09720516204834,
+ "learning_rate": 4.933413892647819e-06,
+ "loss": 0.1323,
+ "step": 143
+ },
+ {
+ "epoch": 1.263157894736842,
+ "grad_norm": 1.802125334739685,
+ "learning_rate": 4.9302954915726535e-06,
+ "loss": 0.1304,
+ "step": 144
+ },
+ {
+ "epoch": 1.2719298245614035,
+ "grad_norm": 1.7151471376419067,
+ "learning_rate": 4.927106761879207e-06,
+ "loss": 0.1264,
+ "step": 145
+ },
+ {
+ "epoch": 1.280701754385965,
+ "grad_norm": 1.6970336437225342,
+ "learning_rate": 4.923847795843894e-06,
+ "loss": 0.1227,
+ "step": 146
+ },
+ {
+ "epoch": 1.2894736842105263,
+ "grad_norm": 16.60441017150879,
+ "learning_rate": 4.920518687775647e-06,
+ "loss": 0.1606,
+ "step": 147
+ },
+ {
+ "epoch": 1.2982456140350878,
+ "grad_norm": 6.470354080200195,
+ "learning_rate": 4.917119534013194e-06,
+ "loss": 0.1447,
+ "step": 148
+ },
+ {
+ "epoch": 1.3070175438596492,
+ "grad_norm": 1.4908231496810913,
+ "learning_rate": 4.913650432922264e-06,
+ "loss": 0.1343,
+ "step": 149
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.19964861869812,
+ "learning_rate": 4.91011148489274e-06,
+ "loss": 0.1354,
+ "step": 150
+ },
+ {
+ "epoch": 1.3245614035087718,
+ "grad_norm": 2.6052839756011963,
+ "learning_rate": 4.906502792335761e-06,
+ "loss": 0.1342,
+ "step": 151
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.0719165802001953,
+ "learning_rate": 4.9028244596807525e-06,
+ "loss": 0.1359,
+ "step": 152
+ },
+ {
+ "epoch": 1.3421052631578947,
+ "grad_norm": 0.8086919784545898,
+ "learning_rate": 4.899076593372405e-06,
+ "loss": 0.1279,
+ "step": 153
+ },
+ {
+ "epoch": 1.3508771929824561,
+ "grad_norm": 1.0056848526000977,
+ "learning_rate": 4.8952593018675955e-06,
+ "loss": 0.1162,
+ "step": 154
+ },
+ {
+ "epoch": 1.3596491228070176,
+ "grad_norm": 5.72553014755249,
+ "learning_rate": 4.891372695632249e-06,
+ "loss": 0.1315,
+ "step": 155
+ },
+ {
+ "epoch": 1.368421052631579,
+ "grad_norm": 1.522894024848938,
+ "learning_rate": 4.887416887138139e-06,
+ "loss": 0.1266,
+ "step": 156
+ },
+ {
+ "epoch": 1.3771929824561404,
+ "grad_norm": 2.019472122192383,
+ "learning_rate": 4.883391990859635e-06,
+ "loss": 0.1262,
+ "step": 157
+ },
+ {
+ "epoch": 1.3859649122807016,
+ "grad_norm": 1.8594422340393066,
+ "learning_rate": 4.879298123270391e-06,
+ "loss": 0.125,
+ "step": 158
+ },
+ {
+ "epoch": 1.3947368421052633,
+ "grad_norm": 1.365377426147461,
+ "learning_rate": 4.8751354028399725e-06,
+ "loss": 0.1218,
+ "step": 159
+ },
+ {
+ "epoch": 1.4035087719298245,
+ "grad_norm": 3.553309917449951,
+ "learning_rate": 4.870903950030429e-06,
+ "loss": 0.1272,
+ "step": 160
+ },
+ {
+ "epoch": 1.412280701754386,
+ "grad_norm": 2.1770920753479004,
+ "learning_rate": 4.866603887292809e-06,
+ "loss": 0.1213,
+ "step": 161
+ },
+ {
+ "epoch": 1.4210526315789473,
+ "grad_norm": 1.6058955192565918,
+ "learning_rate": 4.862235339063613e-06,
+ "loss": 0.1173,
+ "step": 162
+ },
+ {
+ "epoch": 1.4298245614035088,
+ "grad_norm": 1.3208314180374146,
+ "learning_rate": 4.857798431761199e-06,
+ "loss": 0.1183,
+ "step": 163
+ },
+ {
+ "epoch": 1.4385964912280702,
+ "grad_norm": 1.282729983329773,
+ "learning_rate": 4.853293293782118e-06,
+ "loss": 0.1209,
+ "step": 164
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 1.3838152885437012,
+ "learning_rate": 4.848720055497401e-06,
+ "loss": 0.1198,
+ "step": 165
+ },
+ {
+ "epoch": 1.456140350877193,
+ "grad_norm": 1.2930737733840942,
+ "learning_rate": 4.844078849248785e-06,
+ "loss": 0.1268,
+ "step": 166
+ },
+ {
+ "epoch": 1.4649122807017543,
+ "grad_norm": 1.7022266387939453,
+ "learning_rate": 4.839369809344888e-06,
+ "loss": 0.1198,
+ "step": 167
+ },
+ {
+ "epoch": 1.4736842105263157,
+ "grad_norm": 1.0927815437316895,
+ "learning_rate": 4.834593072057313e-06,
+ "loss": 0.1132,
+ "step": 168
+ },
+ {
+ "epoch": 1.4824561403508771,
+ "grad_norm": 0.9326333999633789,
+ "learning_rate": 4.829748775616716e-06,
+ "loss": 0.1193,
+ "step": 169
+ },
+ {
+ "epoch": 1.4912280701754386,
+ "grad_norm": 1.3564742803573608,
+ "learning_rate": 4.8248370602087954e-06,
+ "loss": 0.118,
+ "step": 170
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.19778573513031,
+ "learning_rate": 4.819858067970243e-06,
+ "loss": 0.1122,
+ "step": 171
+ },
+ {
+ "epoch": 1.5087719298245614,
+ "grad_norm": 2.8438351154327393,
+ "learning_rate": 4.814811942984625e-06,
+ "loss": 0.1217,
+ "step": 172
+ },
+ {
+ "epoch": 1.5175438596491229,
+ "grad_norm": 1.0701063871383667,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 0.1114,
+ "step": 173
+ },
+ {
+ "epoch": 1.526315789473684,
+ "grad_norm": 0.9053553938865662,
+ "learning_rate": 4.804518880815776e-06,
+ "loss": 0.1178,
+ "step": 174
+ },
+ {
+ "epoch": 1.5350877192982457,
+ "grad_norm": 0.42274603247642517,
+ "learning_rate": 4.799272241496259e-06,
+ "loss": 0.1091,
+ "step": 175
+ },
+ {
+ "epoch": 1.543859649122807,
+ "grad_norm": 0.8576470017433167,
+ "learning_rate": 4.793959065148484e-06,
+ "loss": 0.1134,
+ "step": 176
+ },
+ {
+ "epoch": 1.5526315789473686,
+ "grad_norm": 0.5910662412643433,
+ "learning_rate": 4.78857950552674e-06,
+ "loss": 0.1148,
+ "step": 177
+ },
+ {
+ "epoch": 1.5614035087719298,
+ "grad_norm": 0.8761632442474365,
+ "learning_rate": 4.783133718306331e-06,
+ "loss": 0.1125,
+ "step": 178
+ },
+ {
+ "epoch": 1.5701754385964912,
+ "grad_norm": 1.9190795421600342,
+ "learning_rate": 4.777621861079079e-06,
+ "loss": 0.1148,
+ "step": 179
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 0.6199957728385925,
+ "learning_rate": 4.772044093348757e-06,
+ "loss": 0.1097,
+ "step": 180
+ },
+ {
+ "epoch": 1.587719298245614,
+ "grad_norm": 1.562089443206787,
+ "learning_rate": 4.766400576526479e-06,
+ "loss": 0.1097,
+ "step": 181
+ },
+ {
+ "epoch": 1.5964912280701755,
+ "grad_norm": 1.4957091808319092,
+ "learning_rate": 4.760691473926021e-06,
+ "loss": 0.1216,
+ "step": 182
+ },
+ {
+ "epoch": 1.6052631578947367,
+ "grad_norm": 0.9863570332527161,
+ "learning_rate": 4.754916950759105e-06,
+ "loss": 0.1122,
+ "step": 183
+ },
+ {
+ "epoch": 1.6140350877192984,
+ "grad_norm": 0.5803346633911133,
+ "learning_rate": 4.749077174130609e-06,
+ "loss": 0.1103,
+ "step": 184
+ },
+ {
+ "epoch": 1.6228070175438596,
+ "grad_norm": 1.8789891004562378,
+ "learning_rate": 4.743172313033738e-06,
+ "loss": 0.1191,
+ "step": 185
+ },
+ {
+ "epoch": 1.631578947368421,
+ "grad_norm": 0.8731380105018616,
+ "learning_rate": 4.7372025383451285e-06,
+ "loss": 0.1154,
+ "step": 186
+ },
+ {
+ "epoch": 1.6403508771929824,
+ "grad_norm": 1.3535627126693726,
+ "learning_rate": 4.7311680228199075e-06,
+ "loss": 0.1123,
+ "step": 187
+ },
+ {
+ "epoch": 1.6491228070175439,
+ "grad_norm": 0.7211089134216309,
+ "learning_rate": 4.725068941086693e-06,
+ "loss": 0.1134,
+ "step": 188
+ },
+ {
+ "epoch": 1.6578947368421053,
+ "grad_norm": 1.4752328395843506,
+ "learning_rate": 4.718905469642534e-06,
+ "loss": 0.1185,
+ "step": 189
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 0.9822680354118347,
+ "learning_rate": 4.712677786847814e-06,
+ "loss": 0.1146,
+ "step": 190
+ },
+ {
+ "epoch": 1.6754385964912282,
+ "grad_norm": 1.1308330297470093,
+ "learning_rate": 4.706386072921083e-06,
+ "loss": 0.1061,
+ "step": 191
+ },
+ {
+ "epoch": 1.6842105263157894,
+ "grad_norm": 5.331939697265625,
+ "learning_rate": 4.70003050993384e-06,
+ "loss": 0.1153,
+ "step": 192
+ },
+ {
+ "epoch": 1.692982456140351,
+ "grad_norm": 0.6911673545837402,
+ "learning_rate": 4.6936112818052674e-06,
+ "loss": 0.1098,
+ "step": 193
+ },
+ {
+ "epoch": 1.7017543859649122,
+ "grad_norm": 0.5160980224609375,
+ "learning_rate": 4.687128574296912e-06,
+ "loss": 0.1073,
+ "step": 194
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 1.5724798440933228,
+ "learning_rate": 4.680582575007303e-06,
+ "loss": 0.121,
+ "step": 195
+ },
+ {
+ "epoch": 1.719298245614035,
+ "grad_norm": 1.3960011005401611,
+ "learning_rate": 4.6739734733665275e-06,
+ "loss": 0.1145,
+ "step": 196
+ },
+ {
+ "epoch": 1.7280701754385965,
+ "grad_norm": 1.4949183464050293,
+ "learning_rate": 4.6673014606307465e-06,
+ "loss": 0.1166,
+ "step": 197
+ },
+ {
+ "epoch": 1.736842105263158,
+ "grad_norm": 1.6873422861099243,
+ "learning_rate": 4.660566729876661e-06,
+ "loss": 0.1115,
+ "step": 198
+ },
+ {
+ "epoch": 1.7456140350877192,
+ "grad_norm": 1.3443641662597656,
+ "learning_rate": 4.653769475995926e-06,
+ "loss": 0.1119,
+ "step": 199
+ },
+ {
+ "epoch": 1.7543859649122808,
+ "grad_norm": 0.807525098323822,
+ "learning_rate": 4.646909895689508e-06,
+ "loss": 0.1059,
+ "step": 200
+ },
+ {
+ "epoch": 1.763157894736842,
+ "grad_norm": 1.589316964149475,
+ "learning_rate": 4.639988187461995e-06,
+ "loss": 0.1151,
+ "step": 201
+ },
+ {
+ "epoch": 1.7719298245614035,
+ "grad_norm": 2.474756956100464,
+ "learning_rate": 4.633004551615851e-06,
+ "loss": 0.116,
+ "step": 202
+ },
+ {
+ "epoch": 1.780701754385965,
+ "grad_norm": 0.6210195422172546,
+ "learning_rate": 4.62595919024562e-06,
+ "loss": 0.1097,
+ "step": 203
+ },
+ {
+ "epoch": 1.7894736842105263,
+ "grad_norm": 0.7217905521392822,
+ "learning_rate": 4.618852307232078e-06,
+ "loss": 0.1117,
+ "step": 204
+ },
+ {
+ "epoch": 1.7982456140350878,
+ "grad_norm": 1.551251769065857,
+ "learning_rate": 4.611684108236334e-06,
+ "loss": 0.113,
+ "step": 205
+ },
+ {
+ "epoch": 1.807017543859649,
+ "grad_norm": 0.6619828939437866,
+ "learning_rate": 4.604454800693874e-06,
+ "loss": 0.113,
+ "step": 206
+ },
+ {
+ "epoch": 1.8157894736842106,
+ "grad_norm": 0.9461805820465088,
+ "learning_rate": 4.597164593808564e-06,
+ "loss": 0.1093,
+ "step": 207
+ },
+ {
+ "epoch": 1.8245614035087718,
+ "grad_norm": 1.2926547527313232,
+ "learning_rate": 4.589813698546592e-06,
+ "loss": 0.1128,
+ "step": 208
+ },
+ {
+ "epoch": 1.8333333333333335,
+ "grad_norm": 0.8754212856292725,
+ "learning_rate": 4.582402327630368e-06,
+ "loss": 0.1104,
+ "step": 209
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 0.846051812171936,
+ "learning_rate": 4.574930695532357e-06,
+ "loss": 0.1105,
+ "step": 210
+ },
+ {
+ "epoch": 1.8508771929824561,
+ "grad_norm": 1.3332515954971313,
+ "learning_rate": 4.567399018468889e-06,
+ "loss": 0.1101,
+ "step": 211
+ },
+ {
+ "epoch": 1.8596491228070176,
+ "grad_norm": 0.8729192614555359,
+ "learning_rate": 4.5598075143938855e-06,
+ "loss": 0.1081,
+ "step": 212
+ },
+ {
+ "epoch": 1.868421052631579,
+ "grad_norm": 0.8618345260620117,
+ "learning_rate": 4.552156402992567e-06,
+ "loss": 0.1059,
+ "step": 213
+ },
+ {
+ "epoch": 1.8771929824561404,
+ "grad_norm": 1.2135930061340332,
+ "learning_rate": 4.544445905675082e-06,
+ "loss": 0.1105,
+ "step": 214
+ },
+ {
+ "epoch": 1.8859649122807016,
+ "grad_norm": 0.8405666351318359,
+ "learning_rate": 4.536676245570111e-06,
+ "loss": 0.1118,
+ "step": 215
+ },
+ {
+ "epoch": 1.8947368421052633,
+ "grad_norm": 0.42860639095306396,
+ "learning_rate": 4.528847647518403e-06,
+ "loss": 0.1093,
+ "step": 216
+ },
+ {
+ "epoch": 1.9035087719298245,
+ "grad_norm": 1.1538206338882446,
+ "learning_rate": 4.520960338066271e-06,
+ "loss": 0.1088,
+ "step": 217
+ },
+ {
+ "epoch": 1.912280701754386,
+ "grad_norm": 0.5870749354362488,
+ "learning_rate": 4.513014545459038e-06,
+ "loss": 0.1061,
+ "step": 218
+ },
+ {
+ "epoch": 1.9210526315789473,
+ "grad_norm": 0.7279748916625977,
+ "learning_rate": 4.505010499634427e-06,
+ "loss": 0.1032,
+ "step": 219
+ },
+ {
+ "epoch": 1.9298245614035088,
+ "grad_norm": 0.6331414580345154,
+ "learning_rate": 4.4969484322159125e-06,
+ "loss": 0.1109,
+ "step": 220
+ },
+ {
+ "epoch": 1.9385964912280702,
+ "grad_norm": 0.9024543166160583,
+ "learning_rate": 4.488828576506014e-06,
+ "loss": 0.1094,
+ "step": 221
+ },
+ {
+ "epoch": 1.9473684210526314,
+ "grad_norm": 3.540376901626587,
+ "learning_rate": 4.480651167479545e-06,
+ "loss": 0.1154,
+ "step": 222
+ },
+ {
+ "epoch": 1.956140350877193,
+ "grad_norm": 0.9506739377975464,
+ "learning_rate": 4.472416441776817e-06,
+ "loss": 0.108,
+ "step": 223
+ },
+ {
+ "epoch": 1.9649122807017543,
+ "grad_norm": 0.6585081815719604,
+ "learning_rate": 4.464124637696786e-06,
+ "loss": 0.1033,
+ "step": 224
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 1.143038034439087,
+ "learning_rate": 4.455775995190161e-06,
+ "loss": 0.1092,
+ "step": 225
+ },
+ {
+ "epoch": 1.9824561403508771,
+ "grad_norm": 1.148261547088623,
+ "learning_rate": 4.4473707558524555e-06,
+ "loss": 0.1076,
+ "step": 226
+ },
+ {
+ "epoch": 1.9912280701754386,
+ "grad_norm": 0.7375811338424683,
+ "learning_rate": 4.438909162917003e-06,
+ "loss": 0.108,
+ "step": 227
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.5254591703414917,
+ "learning_rate": 4.430391461247911e-06,
+ "loss": 0.1079,
+ "step": 228
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 684,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 114,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.91998014829704e+18,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-228/training_args.bin b/checkpoint-228/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf
--- /dev/null
+++ b/checkpoint-228/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae
+size 7992
diff --git a/checkpoint-228/zero_to_fp32.py b/checkpoint-228/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-228/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-342/README.md b/checkpoint-342/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615
--- /dev/null
+++ b/checkpoint-342/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-342/adapter_config.json b/checkpoint-342/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/checkpoint-342/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-342/adapter_model.safetensors b/checkpoint-342/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..71b98dd552aafec16a39e9cc46bf6dce29a76e35
--- /dev/null
+++ b/checkpoint-342/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52db36c2e6cb4cc680eba88475f97c12ef838ab20fcdeb613dada0e649dacc33
+size 9016826528
diff --git a/checkpoint-342/global_step342/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-342/global_step342/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..70ef0d03cd0cb866abbc958a3726be6d37928544
--- /dev/null
+++ b/checkpoint-342/global_step342/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a50d0ceb927cbc242628a18b8ea961fd93acc184410f4f156e60aaa6269580
+size 27050164444
diff --git a/checkpoint-342/global_step342/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-342/global_step342/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4c163c88be363e8cbaebe6d6a0db05641f5175ca
--- /dev/null
+++ b/checkpoint-342/global_step342/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:017dce42a9b2359d7b2aa38ef8c0c032092cf71f07d0bf4a3759737a9cdbe71f
+size 27050169884
diff --git a/checkpoint-342/global_step342/mp_rank_00_model_states.pt b/checkpoint-342/global_step342/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..040fab0a231893ace089247e1bb3f85f57e0c661
--- /dev/null
+++ b/checkpoint-342/global_step342/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f2b430da1dda443608e385aaef5d7aa3763e73b3695ed2e293eacf8e7009fed
+size 9776788601
diff --git a/checkpoint-342/latest b/checkpoint-342/latest
new file mode 100644
index 0000000000000000000000000000000000000000..c865948ad34ed67e3b6a2d0505df96492e4bcc82
--- /dev/null
+++ b/checkpoint-342/latest
@@ -0,0 +1 @@
+global_step342
\ No newline at end of file
diff --git a/checkpoint-342/rng_state_0.pth b/checkpoint-342/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3c437a4fe2d46cd991229eb636f65c53484183ee
--- /dev/null
+++ b/checkpoint-342/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78ece40f5d1a720cb25857302767813fd74736b0b26d2e81bc81a7aad3a91d1c
+size 14512
diff --git a/checkpoint-342/rng_state_1.pth b/checkpoint-342/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d858d57424a138ea07f769788e2868d8cbd1b1e7
--- /dev/null
+++ b/checkpoint-342/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3acd28755d11d91e6050ba9d039c96c56fa63aa16b6394139525740a1c647f23
+size 14512
diff --git a/checkpoint-342/scheduler.pt b/checkpoint-342/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8d33e9c5460dd802e989edef8d2c82a0237bcc7c
--- /dev/null
+++ b/checkpoint-342/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7651f6a2b549eb3de066bc2352d6529046a6983d2871e2b4d4fb602cb7961725
+size 1064
diff --git a/checkpoint-342/special_tokens_map.json b/checkpoint-342/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-342/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-342/tokenizer.json b/checkpoint-342/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-342/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-342/tokenizer_config.json b/checkpoint-342/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/checkpoint-342/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-342/trainer_state.json b/checkpoint-342/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8b4161f89c5188c42309dd8b6e2cfcb53138aa9
--- /dev/null
+++ b/checkpoint-342/trainer_state.json
@@ -0,0 +1,2427 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 342,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.008771929824561403,
+ "grad_norm": 39.56407165527344,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 5.1375,
+ "step": 1
+ },
+ {
+ "epoch": 0.017543859649122806,
+ "grad_norm": 40.30452346801758,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 5.1185,
+ "step": 2
+ },
+ {
+ "epoch": 0.02631578947368421,
+ "grad_norm": 40.062313079833984,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 5.0762,
+ "step": 3
+ },
+ {
+ "epoch": 0.03508771929824561,
+ "grad_norm": 39.17148208618164,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 5.016,
+ "step": 4
+ },
+ {
+ "epoch": 0.043859649122807015,
+ "grad_norm": 40.67367172241211,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 5.0428,
+ "step": 5
+ },
+ {
+ "epoch": 0.05263157894736842,
+ "grad_norm": 38.18095016479492,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 5.2025,
+ "step": 6
+ },
+ {
+ "epoch": 0.06140350877192982,
+ "grad_norm": 39.12940979003906,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 4.9896,
+ "step": 7
+ },
+ {
+ "epoch": 0.07017543859649122,
+ "grad_norm": 38.84568405151367,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 5.1078,
+ "step": 8
+ },
+ {
+ "epoch": 0.07894736842105263,
+ "grad_norm": 39.38333511352539,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 5.0808,
+ "step": 9
+ },
+ {
+ "epoch": 0.08771929824561403,
+ "grad_norm": 39.427650451660156,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 5.0534,
+ "step": 10
+ },
+ {
+ "epoch": 0.09649122807017543,
+ "grad_norm": 39.29513168334961,
+ "learning_rate": 5.5e-07,
+ "loss": 5.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.10526315789473684,
+ "grad_norm": 39.641231536865234,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 5.0317,
+ "step": 12
+ },
+ {
+ "epoch": 0.11403508771929824,
+ "grad_norm": 37.91259765625,
+ "learning_rate": 6.5e-07,
+ "loss": 4.912,
+ "step": 13
+ },
+ {
+ "epoch": 0.12280701754385964,
+ "grad_norm": 38.203548431396484,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 4.9705,
+ "step": 14
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 39.15998840332031,
+ "learning_rate": 7.5e-07,
+ "loss": 4.6962,
+ "step": 15
+ },
+ {
+ "epoch": 0.14035087719298245,
+ "grad_norm": 37.754669189453125,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 4.6262,
+ "step": 16
+ },
+ {
+ "epoch": 0.14912280701754385,
+ "grad_norm": 35.871490478515625,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 4.5422,
+ "step": 17
+ },
+ {
+ "epoch": 0.15789473684210525,
+ "grad_norm": 36.16888427734375,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 4.664,
+ "step": 18
+ },
+ {
+ "epoch": 0.16666666666666666,
+ "grad_norm": 33.520118713378906,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 4.4697,
+ "step": 19
+ },
+ {
+ "epoch": 0.17543859649122806,
+ "grad_norm": 30.896282196044922,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 4.3568,
+ "step": 20
+ },
+ {
+ "epoch": 0.18421052631578946,
+ "grad_norm": 29.944643020629883,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 4.2269,
+ "step": 21
+ },
+ {
+ "epoch": 0.19298245614035087,
+ "grad_norm": 25.224485397338867,
+ "learning_rate": 1.1e-06,
+ "loss": 4.1272,
+ "step": 22
+ },
+ {
+ "epoch": 0.20175438596491227,
+ "grad_norm": 24.410480499267578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 4.0585,
+ "step": 23
+ },
+ {
+ "epoch": 0.21052631578947367,
+ "grad_norm": 21.480648040771484,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 3.9472,
+ "step": 24
+ },
+ {
+ "epoch": 0.21929824561403508,
+ "grad_norm": 20.61946678161621,
+ "learning_rate": 1.25e-06,
+ "loss": 3.8879,
+ "step": 25
+ },
+ {
+ "epoch": 0.22807017543859648,
+ "grad_norm": 19.578271865844727,
+ "learning_rate": 1.3e-06,
+ "loss": 3.6783,
+ "step": 26
+ },
+ {
+ "epoch": 0.23684210526315788,
+ "grad_norm": 17.418983459472656,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 3.6826,
+ "step": 27
+ },
+ {
+ "epoch": 0.24561403508771928,
+ "grad_norm": 18.160301208496094,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 3.478,
+ "step": 28
+ },
+ {
+ "epoch": 0.2543859649122807,
+ "grad_norm": 17.573204040527344,
+ "learning_rate": 1.45e-06,
+ "loss": 3.459,
+ "step": 29
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 17.1265869140625,
+ "learning_rate": 1.5e-06,
+ "loss": 3.3999,
+ "step": 30
+ },
+ {
+ "epoch": 0.2719298245614035,
+ "grad_norm": 15.527145385742188,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 3.2817,
+ "step": 31
+ },
+ {
+ "epoch": 0.2807017543859649,
+ "grad_norm": 14.773847579956055,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 3.234,
+ "step": 32
+ },
+ {
+ "epoch": 0.2894736842105263,
+ "grad_norm": 12.039301872253418,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 3.132,
+ "step": 33
+ },
+ {
+ "epoch": 0.2982456140350877,
+ "grad_norm": 9.217979431152344,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 3.0548,
+ "step": 34
+ },
+ {
+ "epoch": 0.30701754385964913,
+ "grad_norm": 7.575639724731445,
+ "learning_rate": 1.75e-06,
+ "loss": 2.9529,
+ "step": 35
+ },
+ {
+ "epoch": 0.3157894736842105,
+ "grad_norm": 7.496004104614258,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 2.8967,
+ "step": 36
+ },
+ {
+ "epoch": 0.32456140350877194,
+ "grad_norm": 7.45414924621582,
+ "learning_rate": 1.85e-06,
+ "loss": 2.8837,
+ "step": 37
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 8.555658340454102,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 2.7473,
+ "step": 38
+ },
+ {
+ "epoch": 0.34210526315789475,
+ "grad_norm": 10.03805160522461,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 2.7355,
+ "step": 39
+ },
+ {
+ "epoch": 0.3508771929824561,
+ "grad_norm": 9.30649471282959,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 2.6587,
+ "step": 40
+ },
+ {
+ "epoch": 0.35964912280701755,
+ "grad_norm": 8.510339736938477,
+ "learning_rate": 2.05e-06,
+ "loss": 2.5977,
+ "step": 41
+ },
+ {
+ "epoch": 0.3684210526315789,
+ "grad_norm": 4.709080696105957,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 2.6286,
+ "step": 42
+ },
+ {
+ "epoch": 0.37719298245614036,
+ "grad_norm": 5.128961086273193,
+ "learning_rate": 2.15e-06,
+ "loss": 2.4558,
+ "step": 43
+ },
+ {
+ "epoch": 0.38596491228070173,
+ "grad_norm": 5.190136432647705,
+ "learning_rate": 2.2e-06,
+ "loss": 2.4432,
+ "step": 44
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 4.893551349639893,
+ "learning_rate": 2.25e-06,
+ "loss": 2.4939,
+ "step": 45
+ },
+ {
+ "epoch": 0.40350877192982454,
+ "grad_norm": 5.2434983253479,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 2.3381,
+ "step": 46
+ },
+ {
+ "epoch": 0.41228070175438597,
+ "grad_norm": 5.122412204742432,
+ "learning_rate": 2.35e-06,
+ "loss": 2.313,
+ "step": 47
+ },
+ {
+ "epoch": 0.42105263157894735,
+ "grad_norm": 4.577274799346924,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 2.2236,
+ "step": 48
+ },
+ {
+ "epoch": 0.4298245614035088,
+ "grad_norm": 4.722769737243652,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 2.1987,
+ "step": 49
+ },
+ {
+ "epoch": 0.43859649122807015,
+ "grad_norm": 5.059235095977783,
+ "learning_rate": 2.5e-06,
+ "loss": 2.1415,
+ "step": 50
+ },
+ {
+ "epoch": 0.4473684210526316,
+ "grad_norm": 4.454439640045166,
+ "learning_rate": 2.55e-06,
+ "loss": 2.0466,
+ "step": 51
+ },
+ {
+ "epoch": 0.45614035087719296,
+ "grad_norm": 4.94586706161499,
+ "learning_rate": 2.6e-06,
+ "loss": 1.8762,
+ "step": 52
+ },
+ {
+ "epoch": 0.4649122807017544,
+ "grad_norm": 4.704402446746826,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.8012,
+ "step": 53
+ },
+ {
+ "epoch": 0.47368421052631576,
+ "grad_norm": 6.125903129577637,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 1.7669,
+ "step": 54
+ },
+ {
+ "epoch": 0.4824561403508772,
+ "grad_norm": 4.5356059074401855,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.6607,
+ "step": 55
+ },
+ {
+ "epoch": 0.49122807017543857,
+ "grad_norm": 6.56803035736084,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 1.6291,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 4.910050392150879,
+ "learning_rate": 2.85e-06,
+ "loss": 1.5545,
+ "step": 57
+ },
+ {
+ "epoch": 0.5087719298245614,
+ "grad_norm": 8.733433723449707,
+ "learning_rate": 2.9e-06,
+ "loss": 1.4206,
+ "step": 58
+ },
+ {
+ "epoch": 0.5175438596491229,
+ "grad_norm": 8.582486152648926,
+ "learning_rate": 2.95e-06,
+ "loss": 1.3912,
+ "step": 59
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 13.710689544677734,
+ "learning_rate": 3e-06,
+ "loss": 1.3297,
+ "step": 60
+ },
+ {
+ "epoch": 0.5350877192982456,
+ "grad_norm": 23.400312423706055,
+ "learning_rate": 3.05e-06,
+ "loss": 1.296,
+ "step": 61
+ },
+ {
+ "epoch": 0.543859649122807,
+ "grad_norm": 5.678805351257324,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.2259,
+ "step": 62
+ },
+ {
+ "epoch": 0.5526315789473685,
+ "grad_norm": 14.700899124145508,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 1.1087,
+ "step": 63
+ },
+ {
+ "epoch": 0.5614035087719298,
+ "grad_norm": 19.38919448852539,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 1.1805,
+ "step": 64
+ },
+ {
+ "epoch": 0.5701754385964912,
+ "grad_norm": 8.460039138793945,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 1.0963,
+ "step": 65
+ },
+ {
+ "epoch": 0.5789473684210527,
+ "grad_norm": 13.371014595031738,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 1.0627,
+ "step": 66
+ },
+ {
+ "epoch": 0.5877192982456141,
+ "grad_norm": 22.380569458007812,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0869,
+ "step": 67
+ },
+ {
+ "epoch": 0.5964912280701754,
+ "grad_norm": 5.780513286590576,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9991,
+ "step": 68
+ },
+ {
+ "epoch": 0.6052631578947368,
+ "grad_norm": 19.850841522216797,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9683,
+ "step": 69
+ },
+ {
+ "epoch": 0.6140350877192983,
+ "grad_norm": 17.160703659057617,
+ "learning_rate": 3.5e-06,
+ "loss": 0.845,
+ "step": 70
+ },
+ {
+ "epoch": 0.6228070175438597,
+ "grad_norm": 14.264311790466309,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8059,
+ "step": 71
+ },
+ {
+ "epoch": 0.631578947368421,
+ "grad_norm": 26.39459991455078,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.85,
+ "step": 72
+ },
+ {
+ "epoch": 0.6403508771929824,
+ "grad_norm": 51.10348892211914,
+ "learning_rate": 3.65e-06,
+ "loss": 0.9755,
+ "step": 73
+ },
+ {
+ "epoch": 0.6491228070175439,
+ "grad_norm": 28.795856475830078,
+ "learning_rate": 3.7e-06,
+ "loss": 0.8966,
+ "step": 74
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.6617937088012695,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7716,
+ "step": 75
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.729666709899902,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.7578,
+ "step": 76
+ },
+ {
+ "epoch": 0.6754385964912281,
+ "grad_norm": 7.109970569610596,
+ "learning_rate": 3.85e-06,
+ "loss": 0.7055,
+ "step": 77
+ },
+ {
+ "epoch": 0.6842105263157895,
+ "grad_norm": 20.84659194946289,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.7458,
+ "step": 78
+ },
+ {
+ "epoch": 0.6929824561403509,
+ "grad_norm": 21.601303100585938,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6879,
+ "step": 79
+ },
+ {
+ "epoch": 0.7017543859649122,
+ "grad_norm": 3.6914751529693604,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6179,
+ "step": 80
+ },
+ {
+ "epoch": 0.7105263157894737,
+ "grad_norm": 16.539325714111328,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5716,
+ "step": 81
+ },
+ {
+ "epoch": 0.7192982456140351,
+ "grad_norm": 13.931925773620605,
+ "learning_rate": 4.1e-06,
+ "loss": 0.558,
+ "step": 82
+ },
+ {
+ "epoch": 0.7280701754385965,
+ "grad_norm": 10.52951717376709,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6018,
+ "step": 83
+ },
+ {
+ "epoch": 0.7368421052631579,
+ "grad_norm": 17.337060928344727,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5501,
+ "step": 84
+ },
+ {
+ "epoch": 0.7456140350877193,
+ "grad_norm": 13.500468254089355,
+ "learning_rate": 4.25e-06,
+ "loss": 0.5214,
+ "step": 85
+ },
+ {
+ "epoch": 0.7543859649122807,
+ "grad_norm": 10.290645599365234,
+ "learning_rate": 4.3e-06,
+ "loss": 0.4996,
+ "step": 86
+ },
+ {
+ "epoch": 0.7631578947368421,
+ "grad_norm": 9.757556915283203,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.498,
+ "step": 87
+ },
+ {
+ "epoch": 0.7719298245614035,
+ "grad_norm": 9.325140953063965,
+ "learning_rate": 4.4e-06,
+ "loss": 0.4721,
+ "step": 88
+ },
+ {
+ "epoch": 0.7807017543859649,
+ "grad_norm": 2.9322128295898438,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.4528,
+ "step": 89
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 10.484073638916016,
+ "learning_rate": 4.5e-06,
+ "loss": 0.445,
+ "step": 90
+ },
+ {
+ "epoch": 0.7982456140350878,
+ "grad_norm": 32.7827262878418,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5105,
+ "step": 91
+ },
+ {
+ "epoch": 0.8070175438596491,
+ "grad_norm": 2.8477306365966797,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.4117,
+ "step": 92
+ },
+ {
+ "epoch": 0.8157894736842105,
+ "grad_norm": 2.7680225372314453,
+ "learning_rate": 4.65e-06,
+ "loss": 0.3653,
+ "step": 93
+ },
+ {
+ "epoch": 0.8245614035087719,
+ "grad_norm": 2.6512742042541504,
+ "learning_rate": 4.7e-06,
+ "loss": 0.3878,
+ "step": 94
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 6.453914165496826,
+ "learning_rate": 4.75e-06,
+ "loss": 0.3611,
+ "step": 95
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.4594080448150635,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.3817,
+ "step": 96
+ },
+ {
+ "epoch": 0.8508771929824561,
+ "grad_norm": 3.6144917011260986,
+ "learning_rate": 4.85e-06,
+ "loss": 0.3618,
+ "step": 97
+ },
+ {
+ "epoch": 0.8596491228070176,
+ "grad_norm": 5.349407196044922,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.3218,
+ "step": 98
+ },
+ {
+ "epoch": 0.868421052631579,
+ "grad_norm": 13.671236991882324,
+ "learning_rate": 4.95e-06,
+ "loss": 0.3329,
+ "step": 99
+ },
+ {
+ "epoch": 0.8771929824561403,
+ "grad_norm": 5.84046745300293,
+ "learning_rate": 5e-06,
+ "loss": 0.2967,
+ "step": 100
+ },
+ {
+ "epoch": 0.8859649122807017,
+ "grad_norm": 14.005338668823242,
+ "learning_rate": 4.999963827125897e-06,
+ "loss": 0.303,
+ "step": 101
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 9.18114185333252,
+ "learning_rate": 4.999855309550366e-06,
+ "loss": 0.2762,
+ "step": 102
+ },
+ {
+ "epoch": 0.9035087719298246,
+ "grad_norm": 3.0800487995147705,
+ "learning_rate": 4.999674450413725e-06,
+ "loss": 0.2628,
+ "step": 103
+ },
+ {
+ "epoch": 0.9122807017543859,
+ "grad_norm": 82.03578186035156,
+ "learning_rate": 4.999421254949728e-06,
+ "loss": 0.4065,
+ "step": 104
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 77.66315460205078,
+ "learning_rate": 4.99909573048542e-06,
+ "loss": 0.4307,
+ "step": 105
+ },
+ {
+ "epoch": 0.9298245614035088,
+ "grad_norm": 18.28767967224121,
+ "learning_rate": 4.998697886440927e-06,
+ "loss": 0.2571,
+ "step": 106
+ },
+ {
+ "epoch": 0.9385964912280702,
+ "grad_norm": 5.960445880889893,
+ "learning_rate": 4.998227734329177e-06,
+ "loss": 0.2847,
+ "step": 107
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 5.437699794769287,
+ "learning_rate": 4.9976852877555755e-06,
+ "loss": 0.2728,
+ "step": 108
+ },
+ {
+ "epoch": 0.956140350877193,
+ "grad_norm": 3.379631280899048,
+ "learning_rate": 4.997070562417602e-06,
+ "loss": 0.2467,
+ "step": 109
+ },
+ {
+ "epoch": 0.9649122807017544,
+ "grad_norm": 3.1625075340270996,
+ "learning_rate": 4.996383576104362e-06,
+ "loss": 0.2273,
+ "step": 110
+ },
+ {
+ "epoch": 0.9736842105263158,
+ "grad_norm": 15.588600158691406,
+ "learning_rate": 4.995624348696071e-06,
+ "loss": 0.2486,
+ "step": 111
+ },
+ {
+ "epoch": 0.9824561403508771,
+ "grad_norm": 2.631044387817383,
+ "learning_rate": 4.9947929021634815e-06,
+ "loss": 0.1964,
+ "step": 112
+ },
+ {
+ "epoch": 0.9912280701754386,
+ "grad_norm": 4.706504821777344,
+ "learning_rate": 4.993889260567239e-06,
+ "loss": 0.1901,
+ "step": 113
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 10.368465423583984,
+ "learning_rate": 4.9929134500571954e-06,
+ "loss": 0.1996,
+ "step": 114
+ },
+ {
+ "epoch": 1.0087719298245614,
+ "grad_norm": 30.44986343383789,
+ "learning_rate": 4.991865498871647e-06,
+ "loss": 0.2606,
+ "step": 115
+ },
+ {
+ "epoch": 1.0175438596491229,
+ "grad_norm": 14.421515464782715,
+ "learning_rate": 4.99074543733652e-06,
+ "loss": 0.2394,
+ "step": 116
+ },
+ {
+ "epoch": 1.0263157894736843,
+ "grad_norm": 14.072005271911621,
+ "learning_rate": 4.989553297864489e-06,
+ "loss": 0.2288,
+ "step": 117
+ },
+ {
+ "epoch": 1.0350877192982457,
+ "grad_norm": 4.395325660705566,
+ "learning_rate": 4.988289114954045e-06,
+ "loss": 0.2129,
+ "step": 118
+ },
+ {
+ "epoch": 1.043859649122807,
+ "grad_norm": 7.286703586578369,
+ "learning_rate": 4.986952925188489e-06,
+ "loss": 0.186,
+ "step": 119
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 8.332784652709961,
+ "learning_rate": 4.98554476723488e-06,
+ "loss": 0.178,
+ "step": 120
+ },
+ {
+ "epoch": 1.0614035087719298,
+ "grad_norm": 1.3646447658538818,
+ "learning_rate": 4.984064681842917e-06,
+ "loss": 0.1687,
+ "step": 121
+ },
+ {
+ "epoch": 1.0701754385964912,
+ "grad_norm": 4.494940757751465,
+ "learning_rate": 4.982512711843753e-06,
+ "loss": 0.1881,
+ "step": 122
+ },
+ {
+ "epoch": 1.0789473684210527,
+ "grad_norm": 3.3929836750030518,
+ "learning_rate": 4.980888902148757e-06,
+ "loss": 0.1764,
+ "step": 123
+ },
+ {
+ "epoch": 1.087719298245614,
+ "grad_norm": 1.8281155824661255,
+ "learning_rate": 4.979193299748225e-06,
+ "loss": 0.1602,
+ "step": 124
+ },
+ {
+ "epoch": 1.0964912280701755,
+ "grad_norm": 3.494239568710327,
+ "learning_rate": 4.977425953710005e-06,
+ "loss": 0.1729,
+ "step": 125
+ },
+ {
+ "epoch": 1.1052631578947367,
+ "grad_norm": 1.500410556793213,
+ "learning_rate": 4.975586915178084e-06,
+ "loss": 0.1666,
+ "step": 126
+ },
+ {
+ "epoch": 1.1140350877192982,
+ "grad_norm": 1.4680222272872925,
+ "learning_rate": 4.973676237371111e-06,
+ "loss": 0.159,
+ "step": 127
+ },
+ {
+ "epoch": 1.1228070175438596,
+ "grad_norm": 3.0383460521698,
+ "learning_rate": 4.971693975580851e-06,
+ "loss": 0.1484,
+ "step": 128
+ },
+ {
+ "epoch": 1.131578947368421,
+ "grad_norm": 3.74821138381958,
+ "learning_rate": 4.969640187170591e-06,
+ "loss": 0.1586,
+ "step": 129
+ },
+ {
+ "epoch": 1.1403508771929824,
+ "grad_norm": 4.682602405548096,
+ "learning_rate": 4.967514931573473e-06,
+ "loss": 0.1619,
+ "step": 130
+ },
+ {
+ "epoch": 1.1491228070175439,
+ "grad_norm": 3.90673565864563,
+ "learning_rate": 4.965318270290779e-06,
+ "loss": 0.164,
+ "step": 131
+ },
+ {
+ "epoch": 1.1578947368421053,
+ "grad_norm": 2.2017388343811035,
+ "learning_rate": 4.963050266890152e-06,
+ "loss": 0.1499,
+ "step": 132
+ },
+ {
+ "epoch": 1.1666666666666667,
+ "grad_norm": 2.4211816787719727,
+ "learning_rate": 4.960710987003753e-06,
+ "loss": 0.1387,
+ "step": 133
+ },
+ {
+ "epoch": 1.1754385964912282,
+ "grad_norm": 1.7753759622573853,
+ "learning_rate": 4.958300498326363e-06,
+ "loss": 0.1441,
+ "step": 134
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 1.5529910326004028,
+ "learning_rate": 4.955818870613425e-06,
+ "loss": 0.1304,
+ "step": 135
+ },
+ {
+ "epoch": 1.1929824561403508,
+ "grad_norm": 2.090593099594116,
+ "learning_rate": 4.953266175679023e-06,
+ "loss": 0.1419,
+ "step": 136
+ },
+ {
+ "epoch": 1.2017543859649122,
+ "grad_norm": 2.7141878604888916,
+ "learning_rate": 4.95064248739381e-06,
+ "loss": 0.1444,
+ "step": 137
+ },
+ {
+ "epoch": 1.2105263157894737,
+ "grad_norm": 2.3690481185913086,
+ "learning_rate": 4.947947881682861e-06,
+ "loss": 0.1383,
+ "step": 138
+ },
+ {
+ "epoch": 1.219298245614035,
+ "grad_norm": 2.2403147220611572,
+ "learning_rate": 4.945182436523482e-06,
+ "loss": 0.1418,
+ "step": 139
+ },
+ {
+ "epoch": 1.2280701754385965,
+ "grad_norm": 1.3939160108566284,
+ "learning_rate": 4.942346231942955e-06,
+ "loss": 0.1307,
+ "step": 140
+ },
+ {
+ "epoch": 1.236842105263158,
+ "grad_norm": 11.276732444763184,
+ "learning_rate": 4.939439350016214e-06,
+ "loss": 0.1397,
+ "step": 141
+ },
+ {
+ "epoch": 1.2456140350877192,
+ "grad_norm": 8.260516166687012,
+ "learning_rate": 4.9364618748634794e-06,
+ "loss": 0.1426,
+ "step": 142
+ },
+ {
+ "epoch": 1.2543859649122808,
+ "grad_norm": 2.09720516204834,
+ "learning_rate": 4.933413892647819e-06,
+ "loss": 0.1323,
+ "step": 143
+ },
+ {
+ "epoch": 1.263157894736842,
+ "grad_norm": 1.802125334739685,
+ "learning_rate": 4.9302954915726535e-06,
+ "loss": 0.1304,
+ "step": 144
+ },
+ {
+ "epoch": 1.2719298245614035,
+ "grad_norm": 1.7151471376419067,
+ "learning_rate": 4.927106761879207e-06,
+ "loss": 0.1264,
+ "step": 145
+ },
+ {
+ "epoch": 1.280701754385965,
+ "grad_norm": 1.6970336437225342,
+ "learning_rate": 4.923847795843894e-06,
+ "loss": 0.1227,
+ "step": 146
+ },
+ {
+ "epoch": 1.2894736842105263,
+ "grad_norm": 16.60441017150879,
+ "learning_rate": 4.920518687775647e-06,
+ "loss": 0.1606,
+ "step": 147
+ },
+ {
+ "epoch": 1.2982456140350878,
+ "grad_norm": 6.470354080200195,
+ "learning_rate": 4.917119534013194e-06,
+ "loss": 0.1447,
+ "step": 148
+ },
+ {
+ "epoch": 1.3070175438596492,
+ "grad_norm": 1.4908231496810913,
+ "learning_rate": 4.913650432922264e-06,
+ "loss": 0.1343,
+ "step": 149
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.19964861869812,
+ "learning_rate": 4.91011148489274e-06,
+ "loss": 0.1354,
+ "step": 150
+ },
+ {
+ "epoch": 1.3245614035087718,
+ "grad_norm": 2.6052839756011963,
+ "learning_rate": 4.906502792335761e-06,
+ "loss": 0.1342,
+ "step": 151
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.0719165802001953,
+ "learning_rate": 4.9028244596807525e-06,
+ "loss": 0.1359,
+ "step": 152
+ },
+ {
+ "epoch": 1.3421052631578947,
+ "grad_norm": 0.8086919784545898,
+ "learning_rate": 4.899076593372405e-06,
+ "loss": 0.1279,
+ "step": 153
+ },
+ {
+ "epoch": 1.3508771929824561,
+ "grad_norm": 1.0056848526000977,
+ "learning_rate": 4.8952593018675955e-06,
+ "loss": 0.1162,
+ "step": 154
+ },
+ {
+ "epoch": 1.3596491228070176,
+ "grad_norm": 5.72553014755249,
+ "learning_rate": 4.891372695632249e-06,
+ "loss": 0.1315,
+ "step": 155
+ },
+ {
+ "epoch": 1.368421052631579,
+ "grad_norm": 1.522894024848938,
+ "learning_rate": 4.887416887138139e-06,
+ "loss": 0.1266,
+ "step": 156
+ },
+ {
+ "epoch": 1.3771929824561404,
+ "grad_norm": 2.019472122192383,
+ "learning_rate": 4.883391990859635e-06,
+ "loss": 0.1262,
+ "step": 157
+ },
+ {
+ "epoch": 1.3859649122807016,
+ "grad_norm": 1.8594422340393066,
+ "learning_rate": 4.879298123270391e-06,
+ "loss": 0.125,
+ "step": 158
+ },
+ {
+ "epoch": 1.3947368421052633,
+ "grad_norm": 1.365377426147461,
+ "learning_rate": 4.8751354028399725e-06,
+ "loss": 0.1218,
+ "step": 159
+ },
+ {
+ "epoch": 1.4035087719298245,
+ "grad_norm": 3.553309917449951,
+ "learning_rate": 4.870903950030429e-06,
+ "loss": 0.1272,
+ "step": 160
+ },
+ {
+ "epoch": 1.412280701754386,
+ "grad_norm": 2.1770920753479004,
+ "learning_rate": 4.866603887292809e-06,
+ "loss": 0.1213,
+ "step": 161
+ },
+ {
+ "epoch": 1.4210526315789473,
+ "grad_norm": 1.6058955192565918,
+ "learning_rate": 4.862235339063613e-06,
+ "loss": 0.1173,
+ "step": 162
+ },
+ {
+ "epoch": 1.4298245614035088,
+ "grad_norm": 1.3208314180374146,
+ "learning_rate": 4.857798431761199e-06,
+ "loss": 0.1183,
+ "step": 163
+ },
+ {
+ "epoch": 1.4385964912280702,
+ "grad_norm": 1.282729983329773,
+ "learning_rate": 4.853293293782118e-06,
+ "loss": 0.1209,
+ "step": 164
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 1.3838152885437012,
+ "learning_rate": 4.848720055497401e-06,
+ "loss": 0.1198,
+ "step": 165
+ },
+ {
+ "epoch": 1.456140350877193,
+ "grad_norm": 1.2930737733840942,
+ "learning_rate": 4.844078849248785e-06,
+ "loss": 0.1268,
+ "step": 166
+ },
+ {
+ "epoch": 1.4649122807017543,
+ "grad_norm": 1.7022266387939453,
+ "learning_rate": 4.839369809344888e-06,
+ "loss": 0.1198,
+ "step": 167
+ },
+ {
+ "epoch": 1.4736842105263157,
+ "grad_norm": 1.0927815437316895,
+ "learning_rate": 4.834593072057313e-06,
+ "loss": 0.1132,
+ "step": 168
+ },
+ {
+ "epoch": 1.4824561403508771,
+ "grad_norm": 0.9326333999633789,
+ "learning_rate": 4.829748775616716e-06,
+ "loss": 0.1193,
+ "step": 169
+ },
+ {
+ "epoch": 1.4912280701754386,
+ "grad_norm": 1.3564742803573608,
+ "learning_rate": 4.8248370602087954e-06,
+ "loss": 0.118,
+ "step": 170
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.19778573513031,
+ "learning_rate": 4.819858067970243e-06,
+ "loss": 0.1122,
+ "step": 171
+ },
+ {
+ "epoch": 1.5087719298245614,
+ "grad_norm": 2.8438351154327393,
+ "learning_rate": 4.814811942984625e-06,
+ "loss": 0.1217,
+ "step": 172
+ },
+ {
+ "epoch": 1.5175438596491229,
+ "grad_norm": 1.0701063871383667,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 0.1114,
+ "step": 173
+ },
+ {
+ "epoch": 1.526315789473684,
+ "grad_norm": 0.9053553938865662,
+ "learning_rate": 4.804518880815776e-06,
+ "loss": 0.1178,
+ "step": 174
+ },
+ {
+ "epoch": 1.5350877192982457,
+ "grad_norm": 0.42274603247642517,
+ "learning_rate": 4.799272241496259e-06,
+ "loss": 0.1091,
+ "step": 175
+ },
+ {
+ "epoch": 1.543859649122807,
+ "grad_norm": 0.8576470017433167,
+ "learning_rate": 4.793959065148484e-06,
+ "loss": 0.1134,
+ "step": 176
+ },
+ {
+ "epoch": 1.5526315789473686,
+ "grad_norm": 0.5910662412643433,
+ "learning_rate": 4.78857950552674e-06,
+ "loss": 0.1148,
+ "step": 177
+ },
+ {
+ "epoch": 1.5614035087719298,
+ "grad_norm": 0.8761632442474365,
+ "learning_rate": 4.783133718306331e-06,
+ "loss": 0.1125,
+ "step": 178
+ },
+ {
+ "epoch": 1.5701754385964912,
+ "grad_norm": 1.9190795421600342,
+ "learning_rate": 4.777621861079079e-06,
+ "loss": 0.1148,
+ "step": 179
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 0.6199957728385925,
+ "learning_rate": 4.772044093348757e-06,
+ "loss": 0.1097,
+ "step": 180
+ },
+ {
+ "epoch": 1.587719298245614,
+ "grad_norm": 1.562089443206787,
+ "learning_rate": 4.766400576526479e-06,
+ "loss": 0.1097,
+ "step": 181
+ },
+ {
+ "epoch": 1.5964912280701755,
+ "grad_norm": 1.4957091808319092,
+ "learning_rate": 4.760691473926021e-06,
+ "loss": 0.1216,
+ "step": 182
+ },
+ {
+ "epoch": 1.6052631578947367,
+ "grad_norm": 0.9863570332527161,
+ "learning_rate": 4.754916950759105e-06,
+ "loss": 0.1122,
+ "step": 183
+ },
+ {
+ "epoch": 1.6140350877192984,
+ "grad_norm": 0.5803346633911133,
+ "learning_rate": 4.749077174130609e-06,
+ "loss": 0.1103,
+ "step": 184
+ },
+ {
+ "epoch": 1.6228070175438596,
+ "grad_norm": 1.8789891004562378,
+ "learning_rate": 4.743172313033738e-06,
+ "loss": 0.1191,
+ "step": 185
+ },
+ {
+ "epoch": 1.631578947368421,
+ "grad_norm": 0.8731380105018616,
+ "learning_rate": 4.7372025383451285e-06,
+ "loss": 0.1154,
+ "step": 186
+ },
+ {
+ "epoch": 1.6403508771929824,
+ "grad_norm": 1.3535627126693726,
+ "learning_rate": 4.7311680228199075e-06,
+ "loss": 0.1123,
+ "step": 187
+ },
+ {
+ "epoch": 1.6491228070175439,
+ "grad_norm": 0.7211089134216309,
+ "learning_rate": 4.725068941086693e-06,
+ "loss": 0.1134,
+ "step": 188
+ },
+ {
+ "epoch": 1.6578947368421053,
+ "grad_norm": 1.4752328395843506,
+ "learning_rate": 4.718905469642534e-06,
+ "loss": 0.1185,
+ "step": 189
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 0.9822680354118347,
+ "learning_rate": 4.712677786847814e-06,
+ "loss": 0.1146,
+ "step": 190
+ },
+ {
+ "epoch": 1.6754385964912282,
+ "grad_norm": 1.1308330297470093,
+ "learning_rate": 4.706386072921083e-06,
+ "loss": 0.1061,
+ "step": 191
+ },
+ {
+ "epoch": 1.6842105263157894,
+ "grad_norm": 5.331939697265625,
+ "learning_rate": 4.70003050993384e-06,
+ "loss": 0.1153,
+ "step": 192
+ },
+ {
+ "epoch": 1.692982456140351,
+ "grad_norm": 0.6911673545837402,
+ "learning_rate": 4.6936112818052674e-06,
+ "loss": 0.1098,
+ "step": 193
+ },
+ {
+ "epoch": 1.7017543859649122,
+ "grad_norm": 0.5160980224609375,
+ "learning_rate": 4.687128574296912e-06,
+ "loss": 0.1073,
+ "step": 194
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 1.5724798440933228,
+ "learning_rate": 4.680582575007303e-06,
+ "loss": 0.121,
+ "step": 195
+ },
+ {
+ "epoch": 1.719298245614035,
+ "grad_norm": 1.3960011005401611,
+ "learning_rate": 4.6739734733665275e-06,
+ "loss": 0.1145,
+ "step": 196
+ },
+ {
+ "epoch": 1.7280701754385965,
+ "grad_norm": 1.4949183464050293,
+ "learning_rate": 4.6673014606307465e-06,
+ "loss": 0.1166,
+ "step": 197
+ },
+ {
+ "epoch": 1.736842105263158,
+ "grad_norm": 1.6873422861099243,
+ "learning_rate": 4.660566729876661e-06,
+ "loss": 0.1115,
+ "step": 198
+ },
+ {
+ "epoch": 1.7456140350877192,
+ "grad_norm": 1.3443641662597656,
+ "learning_rate": 4.653769475995926e-06,
+ "loss": 0.1119,
+ "step": 199
+ },
+ {
+ "epoch": 1.7543859649122808,
+ "grad_norm": 0.807525098323822,
+ "learning_rate": 4.646909895689508e-06,
+ "loss": 0.1059,
+ "step": 200
+ },
+ {
+ "epoch": 1.763157894736842,
+ "grad_norm": 1.589316964149475,
+ "learning_rate": 4.639988187461995e-06,
+ "loss": 0.1151,
+ "step": 201
+ },
+ {
+ "epoch": 1.7719298245614035,
+ "grad_norm": 2.474756956100464,
+ "learning_rate": 4.633004551615851e-06,
+ "loss": 0.116,
+ "step": 202
+ },
+ {
+ "epoch": 1.780701754385965,
+ "grad_norm": 0.6210195422172546,
+ "learning_rate": 4.62595919024562e-06,
+ "loss": 0.1097,
+ "step": 203
+ },
+ {
+ "epoch": 1.7894736842105263,
+ "grad_norm": 0.7217905521392822,
+ "learning_rate": 4.618852307232078e-06,
+ "loss": 0.1117,
+ "step": 204
+ },
+ {
+ "epoch": 1.7982456140350878,
+ "grad_norm": 1.551251769065857,
+ "learning_rate": 4.611684108236334e-06,
+ "loss": 0.113,
+ "step": 205
+ },
+ {
+ "epoch": 1.807017543859649,
+ "grad_norm": 0.6619828939437866,
+ "learning_rate": 4.604454800693874e-06,
+ "loss": 0.113,
+ "step": 206
+ },
+ {
+ "epoch": 1.8157894736842106,
+ "grad_norm": 0.9461805820465088,
+ "learning_rate": 4.597164593808564e-06,
+ "loss": 0.1093,
+ "step": 207
+ },
+ {
+ "epoch": 1.8245614035087718,
+ "grad_norm": 1.2926547527313232,
+ "learning_rate": 4.589813698546592e-06,
+ "loss": 0.1128,
+ "step": 208
+ },
+ {
+ "epoch": 1.8333333333333335,
+ "grad_norm": 0.8754212856292725,
+ "learning_rate": 4.582402327630368e-06,
+ "loss": 0.1104,
+ "step": 209
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 0.846051812171936,
+ "learning_rate": 4.574930695532357e-06,
+ "loss": 0.1105,
+ "step": 210
+ },
+ {
+ "epoch": 1.8508771929824561,
+ "grad_norm": 1.3332515954971313,
+ "learning_rate": 4.567399018468889e-06,
+ "loss": 0.1101,
+ "step": 211
+ },
+ {
+ "epoch": 1.8596491228070176,
+ "grad_norm": 0.8729192614555359,
+ "learning_rate": 4.5598075143938855e-06,
+ "loss": 0.1081,
+ "step": 212
+ },
+ {
+ "epoch": 1.868421052631579,
+ "grad_norm": 0.8618345260620117,
+ "learning_rate": 4.552156402992567e-06,
+ "loss": 0.1059,
+ "step": 213
+ },
+ {
+ "epoch": 1.8771929824561404,
+ "grad_norm": 1.2135930061340332,
+ "learning_rate": 4.544445905675082e-06,
+ "loss": 0.1105,
+ "step": 214
+ },
+ {
+ "epoch": 1.8859649122807016,
+ "grad_norm": 0.8405666351318359,
+ "learning_rate": 4.536676245570111e-06,
+ "loss": 0.1118,
+ "step": 215
+ },
+ {
+ "epoch": 1.8947368421052633,
+ "grad_norm": 0.42860639095306396,
+ "learning_rate": 4.528847647518403e-06,
+ "loss": 0.1093,
+ "step": 216
+ },
+ {
+ "epoch": 1.9035087719298245,
+ "grad_norm": 1.1538206338882446,
+ "learning_rate": 4.520960338066271e-06,
+ "loss": 0.1088,
+ "step": 217
+ },
+ {
+ "epoch": 1.912280701754386,
+ "grad_norm": 0.5870749354362488,
+ "learning_rate": 4.513014545459038e-06,
+ "loss": 0.1061,
+ "step": 218
+ },
+ {
+ "epoch": 1.9210526315789473,
+ "grad_norm": 0.7279748916625977,
+ "learning_rate": 4.505010499634427e-06,
+ "loss": 0.1032,
+ "step": 219
+ },
+ {
+ "epoch": 1.9298245614035088,
+ "grad_norm": 0.6331414580345154,
+ "learning_rate": 4.4969484322159125e-06,
+ "loss": 0.1109,
+ "step": 220
+ },
+ {
+ "epoch": 1.9385964912280702,
+ "grad_norm": 0.9024543166160583,
+ "learning_rate": 4.488828576506014e-06,
+ "loss": 0.1094,
+ "step": 221
+ },
+ {
+ "epoch": 1.9473684210526314,
+ "grad_norm": 3.540376901626587,
+ "learning_rate": 4.480651167479545e-06,
+ "loss": 0.1154,
+ "step": 222
+ },
+ {
+ "epoch": 1.956140350877193,
+ "grad_norm": 0.9506739377975464,
+ "learning_rate": 4.472416441776817e-06,
+ "loss": 0.108,
+ "step": 223
+ },
+ {
+ "epoch": 1.9649122807017543,
+ "grad_norm": 0.6585081815719604,
+ "learning_rate": 4.464124637696786e-06,
+ "loss": 0.1033,
+ "step": 224
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 1.143038034439087,
+ "learning_rate": 4.455775995190161e-06,
+ "loss": 0.1092,
+ "step": 225
+ },
+ {
+ "epoch": 1.9824561403508771,
+ "grad_norm": 1.148261547088623,
+ "learning_rate": 4.4473707558524555e-06,
+ "loss": 0.1076,
+ "step": 226
+ },
+ {
+ "epoch": 1.9912280701754386,
+ "grad_norm": 0.7375811338424683,
+ "learning_rate": 4.438909162917003e-06,
+ "loss": 0.108,
+ "step": 227
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.5254591703414917,
+ "learning_rate": 4.430391461247911e-06,
+ "loss": 0.1079,
+ "step": 228
+ },
+ {
+ "epoch": 2.008771929824561,
+ "grad_norm": 1.0198495388031006,
+ "learning_rate": 4.42181789733298e-06,
+ "loss": 0.1083,
+ "step": 229
+ },
+ {
+ "epoch": 2.017543859649123,
+ "grad_norm": 0.9234157800674438,
+ "learning_rate": 4.413188719276569e-06,
+ "loss": 0.1084,
+ "step": 230
+ },
+ {
+ "epoch": 2.026315789473684,
+ "grad_norm": 0.5215068459510803,
+ "learning_rate": 4.404504176792414e-06,
+ "loss": 0.1067,
+ "step": 231
+ },
+ {
+ "epoch": 2.0350877192982457,
+ "grad_norm": 0.9296736121177673,
+ "learning_rate": 4.3957645211964065e-06,
+ "loss": 0.1066,
+ "step": 232
+ },
+ {
+ "epoch": 2.043859649122807,
+ "grad_norm": 0.8660671710968018,
+ "learning_rate": 4.386970005399314e-06,
+ "loss": 0.108,
+ "step": 233
+ },
+ {
+ "epoch": 2.0526315789473686,
+ "grad_norm": 0.6014883518218994,
+ "learning_rate": 4.378120883899467e-06,
+ "loss": 0.1068,
+ "step": 234
+ },
+ {
+ "epoch": 2.06140350877193,
+ "grad_norm": 0.6370371580123901,
+ "learning_rate": 4.369217412775393e-06,
+ "loss": 0.1076,
+ "step": 235
+ },
+ {
+ "epoch": 2.0701754385964914,
+ "grad_norm": 0.9806828498840332,
+ "learning_rate": 4.360259849678402e-06,
+ "loss": 0.1071,
+ "step": 236
+ },
+ {
+ "epoch": 2.0789473684210527,
+ "grad_norm": 0.6093440651893616,
+ "learning_rate": 4.351248453825137e-06,
+ "loss": 0.1038,
+ "step": 237
+ },
+ {
+ "epoch": 2.087719298245614,
+ "grad_norm": 1.3494842052459717,
+ "learning_rate": 4.3421834859900695e-06,
+ "loss": 0.1105,
+ "step": 238
+ },
+ {
+ "epoch": 2.0964912280701755,
+ "grad_norm": 0.7621576189994812,
+ "learning_rate": 4.333065208497949e-06,
+ "loss": 0.1048,
+ "step": 239
+ },
+ {
+ "epoch": 2.1052631578947367,
+ "grad_norm": 0.5918282866477966,
+ "learning_rate": 4.3238938852162195e-06,
+ "loss": 0.1086,
+ "step": 240
+ },
+ {
+ "epoch": 2.1140350877192984,
+ "grad_norm": 0.7048676609992981,
+ "learning_rate": 4.314669781547379e-06,
+ "loss": 0.1061,
+ "step": 241
+ },
+ {
+ "epoch": 2.1228070175438596,
+ "grad_norm": 1.0750821828842163,
+ "learning_rate": 4.305393164421301e-06,
+ "loss": 0.1082,
+ "step": 242
+ },
+ {
+ "epoch": 2.1315789473684212,
+ "grad_norm": 0.6171414852142334,
+ "learning_rate": 4.296064302287507e-06,
+ "loss": 0.1039,
+ "step": 243
+ },
+ {
+ "epoch": 2.1403508771929824,
+ "grad_norm": 0.8080905079841614,
+ "learning_rate": 4.286683465107403e-06,
+ "loss": 0.1069,
+ "step": 244
+ },
+ {
+ "epoch": 2.1491228070175437,
+ "grad_norm": 0.5281466245651245,
+ "learning_rate": 4.277250924346461e-06,
+ "loss": 0.1069,
+ "step": 245
+ },
+ {
+ "epoch": 2.1578947368421053,
+ "grad_norm": 0.8070254325866699,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.1061,
+ "step": 246
+ },
+ {
+ "epoch": 2.1666666666666665,
+ "grad_norm": 0.8560577630996704,
+ "learning_rate": 4.25823182541713e-06,
+ "loss": 0.1116,
+ "step": 247
+ },
+ {
+ "epoch": 2.175438596491228,
+ "grad_norm": 0.7772330045700073,
+ "learning_rate": 4.2486458176291176e-06,
+ "loss": 0.1092,
+ "step": 248
+ },
+ {
+ "epoch": 2.1842105263157894,
+ "grad_norm": 0.814601719379425,
+ "learning_rate": 4.239009207005096e-06,
+ "loss": 0.1093,
+ "step": 249
+ },
+ {
+ "epoch": 2.192982456140351,
+ "grad_norm": 0.957789957523346,
+ "learning_rate": 4.2293222724121855e-06,
+ "loss": 0.1075,
+ "step": 250
+ },
+ {
+ "epoch": 2.2017543859649122,
+ "grad_norm": 0.500062108039856,
+ "learning_rate": 4.219585294173799e-06,
+ "loss": 0.1048,
+ "step": 251
+ },
+ {
+ "epoch": 2.2105263157894735,
+ "grad_norm": 0.3866419792175293,
+ "learning_rate": 4.209798554061527e-06,
+ "loss": 0.1074,
+ "step": 252
+ },
+ {
+ "epoch": 2.219298245614035,
+ "grad_norm": 1.1853291988372803,
+ "learning_rate": 4.199962335286985e-06,
+ "loss": 0.1076,
+ "step": 253
+ },
+ {
+ "epoch": 2.2280701754385963,
+ "grad_norm": 0.36602887511253357,
+ "learning_rate": 4.1900769224936125e-06,
+ "loss": 0.108,
+ "step": 254
+ },
+ {
+ "epoch": 2.236842105263158,
+ "grad_norm": 0.2530711889266968,
+ "learning_rate": 4.180142601748447e-06,
+ "loss": 0.1041,
+ "step": 255
+ },
+ {
+ "epoch": 2.245614035087719,
+ "grad_norm": 1.3067054748535156,
+ "learning_rate": 4.170159660533834e-06,
+ "loss": 0.1087,
+ "step": 256
+ },
+ {
+ "epoch": 2.254385964912281,
+ "grad_norm": 0.3442043960094452,
+ "learning_rate": 4.160128387739114e-06,
+ "loss": 0.1099,
+ "step": 257
+ },
+ {
+ "epoch": 2.263157894736842,
+ "grad_norm": 1.174796462059021,
+ "learning_rate": 4.150049073652262e-06,
+ "loss": 0.1063,
+ "step": 258
+ },
+ {
+ "epoch": 2.2719298245614037,
+ "grad_norm": 0.5719411969184875,
+ "learning_rate": 4.1399220099514845e-06,
+ "loss": 0.1043,
+ "step": 259
+ },
+ {
+ "epoch": 2.280701754385965,
+ "grad_norm": 0.7268956303596497,
+ "learning_rate": 4.129747489696781e-06,
+ "loss": 0.1038,
+ "step": 260
+ },
+ {
+ "epoch": 2.2894736842105265,
+ "grad_norm": 0.7028316259384155,
+ "learning_rate": 4.119525807321467e-06,
+ "loss": 0.1052,
+ "step": 261
+ },
+ {
+ "epoch": 2.2982456140350878,
+ "grad_norm": 1.015335202217102,
+ "learning_rate": 4.109257258623644e-06,
+ "loss": 0.1116,
+ "step": 262
+ },
+ {
+ "epoch": 2.307017543859649,
+ "grad_norm": 0.7141755819320679,
+ "learning_rate": 4.098942140757646e-06,
+ "loss": 0.108,
+ "step": 263
+ },
+ {
+ "epoch": 2.3157894736842106,
+ "grad_norm": 0.7656403183937073,
+ "learning_rate": 4.0885807522254435e-06,
+ "loss": 0.1043,
+ "step": 264
+ },
+ {
+ "epoch": 2.324561403508772,
+ "grad_norm": 0.43293774127960205,
+ "learning_rate": 4.078173392867998e-06,
+ "loss": 0.1048,
+ "step": 265
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 0.6755763292312622,
+ "learning_rate": 4.0677203638565895e-06,
+ "loss": 0.1064,
+ "step": 266
+ },
+ {
+ "epoch": 2.3421052631578947,
+ "grad_norm": 0.9648827314376831,
+ "learning_rate": 4.0572219676841e-06,
+ "loss": 0.1088,
+ "step": 267
+ },
+ {
+ "epoch": 2.3508771929824563,
+ "grad_norm": 0.32724836468696594,
+ "learning_rate": 4.046678508156259e-06,
+ "loss": 0.1077,
+ "step": 268
+ },
+ {
+ "epoch": 2.3596491228070176,
+ "grad_norm": 0.4696657061576843,
+ "learning_rate": 4.036090290382855e-06,
+ "loss": 0.1067,
+ "step": 269
+ },
+ {
+ "epoch": 2.3684210526315788,
+ "grad_norm": 0.33901306986808777,
+ "learning_rate": 4.025457620768901e-06,
+ "loss": 0.105,
+ "step": 270
+ },
+ {
+ "epoch": 2.3771929824561404,
+ "grad_norm": 0.5703794360160828,
+ "learning_rate": 4.014780807005775e-06,
+ "loss": 0.1033,
+ "step": 271
+ },
+ {
+ "epoch": 2.3859649122807016,
+ "grad_norm": 0.9639355540275574,
+ "learning_rate": 4.004060158062306e-06,
+ "loss": 0.1041,
+ "step": 272
+ },
+ {
+ "epoch": 2.3947368421052633,
+ "grad_norm": 0.8851558566093445,
+ "learning_rate": 3.993295984175845e-06,
+ "loss": 0.1064,
+ "step": 273
+ },
+ {
+ "epoch": 2.4035087719298245,
+ "grad_norm": 0.5200062990188599,
+ "learning_rate": 3.982488596843276e-06,
+ "loss": 0.1056,
+ "step": 274
+ },
+ {
+ "epoch": 2.412280701754386,
+ "grad_norm": 1.160823106765747,
+ "learning_rate": 3.971638308812007e-06,
+ "loss": 0.1069,
+ "step": 275
+ },
+ {
+ "epoch": 2.4210526315789473,
+ "grad_norm": 1.0191210508346558,
+ "learning_rate": 3.9607454340709215e-06,
+ "loss": 0.1042,
+ "step": 276
+ },
+ {
+ "epoch": 2.4298245614035086,
+ "grad_norm": 0.37181487679481506,
+ "learning_rate": 3.949810287841289e-06,
+ "loss": 0.1062,
+ "step": 277
+ },
+ {
+ "epoch": 2.43859649122807,
+ "grad_norm": 0.9328593611717224,
+ "learning_rate": 3.9388331865676436e-06,
+ "loss": 0.1086,
+ "step": 278
+ },
+ {
+ "epoch": 2.4473684210526314,
+ "grad_norm": 0.8024734258651733,
+ "learning_rate": 3.927814447908625e-06,
+ "loss": 0.1051,
+ "step": 279
+ },
+ {
+ "epoch": 2.456140350877193,
+ "grad_norm": 0.9746696352958679,
+ "learning_rate": 3.916754390727795e-06,
+ "loss": 0.1041,
+ "step": 280
+ },
+ {
+ "epoch": 2.4649122807017543,
+ "grad_norm": 0.5457844138145447,
+ "learning_rate": 3.905653335084394e-06,
+ "loss": 0.1052,
+ "step": 281
+ },
+ {
+ "epoch": 2.473684210526316,
+ "grad_norm": 1.0736924409866333,
+ "learning_rate": 3.8945116022240945e-06,
+ "loss": 0.1075,
+ "step": 282
+ },
+ {
+ "epoch": 2.482456140350877,
+ "grad_norm": 0.6335628032684326,
+ "learning_rate": 3.8833295145696964e-06,
+ "loss": 0.1036,
+ "step": 283
+ },
+ {
+ "epoch": 2.4912280701754383,
+ "grad_norm": 0.6909618377685547,
+ "learning_rate": 3.872107395711799e-06,
+ "loss": 0.1089,
+ "step": 284
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 2.1871702671051025,
+ "learning_rate": 3.860845570399435e-06,
+ "loss": 0.1066,
+ "step": 285
+ },
+ {
+ "epoch": 2.5087719298245617,
+ "grad_norm": 0.5831722617149353,
+ "learning_rate": 3.849544364530678e-06,
+ "loss": 0.1055,
+ "step": 286
+ },
+ {
+ "epoch": 2.517543859649123,
+ "grad_norm": 0.5302637815475464,
+ "learning_rate": 3.838204105143204e-06,
+ "loss": 0.1057,
+ "step": 287
+ },
+ {
+ "epoch": 2.526315789473684,
+ "grad_norm": 0.6348035931587219,
+ "learning_rate": 3.8268251204048335e-06,
+ "loss": 0.1089,
+ "step": 288
+ },
+ {
+ "epoch": 2.5350877192982457,
+ "grad_norm": 2.1932008266448975,
+ "learning_rate": 3.815407739604033e-06,
+ "loss": 0.1043,
+ "step": 289
+ },
+ {
+ "epoch": 2.543859649122807,
+ "grad_norm": 0.4388940930366516,
+ "learning_rate": 3.803952293140385e-06,
+ "loss": 0.1055,
+ "step": 290
+ },
+ {
+ "epoch": 2.5526315789473686,
+ "grad_norm": 0.6853339076042175,
+ "learning_rate": 3.7924591125150265e-06,
+ "loss": 0.1036,
+ "step": 291
+ },
+ {
+ "epoch": 2.56140350877193,
+ "grad_norm": 0.34744876623153687,
+ "learning_rate": 3.78092853032106e-06,
+ "loss": 0.1025,
+ "step": 292
+ },
+ {
+ "epoch": 2.5701754385964914,
+ "grad_norm": 0.9523847699165344,
+ "learning_rate": 3.769360880233922e-06,
+ "loss": 0.1067,
+ "step": 293
+ },
+ {
+ "epoch": 2.5789473684210527,
+ "grad_norm": 1.303745985031128,
+ "learning_rate": 3.7577564970017338e-06,
+ "loss": 0.1082,
+ "step": 294
+ },
+ {
+ "epoch": 2.587719298245614,
+ "grad_norm": 0.9468981623649597,
+ "learning_rate": 3.7461157164356103e-06,
+ "loss": 0.1055,
+ "step": 295
+ },
+ {
+ "epoch": 2.5964912280701755,
+ "grad_norm": 0.7204175591468811,
+ "learning_rate": 3.7344388753999434e-06,
+ "loss": 0.1055,
+ "step": 296
+ },
+ {
+ "epoch": 2.6052631578947367,
+ "grad_norm": 0.5110165476799011,
+ "learning_rate": 3.7227263118026537e-06,
+ "loss": 0.1092,
+ "step": 297
+ },
+ {
+ "epoch": 2.6140350877192984,
+ "grad_norm": 0.6483246088027954,
+ "learning_rate": 3.7109783645854116e-06,
+ "loss": 0.1078,
+ "step": 298
+ },
+ {
+ "epoch": 2.6228070175438596,
+ "grad_norm": 0.5058422684669495,
+ "learning_rate": 3.699195373713831e-06,
+ "loss": 0.1073,
+ "step": 299
+ },
+ {
+ "epoch": 2.6315789473684212,
+ "grad_norm": 0.4123518764972687,
+ "learning_rate": 3.6873776801676265e-06,
+ "loss": 0.1053,
+ "step": 300
+ },
+ {
+ "epoch": 2.6403508771929824,
+ "grad_norm": 1.0864709615707397,
+ "learning_rate": 3.675525625930751e-06,
+ "loss": 0.1048,
+ "step": 301
+ },
+ {
+ "epoch": 2.6491228070175437,
+ "grad_norm": 1.0264904499053955,
+ "learning_rate": 3.6636395539814975e-06,
+ "loss": 0.1059,
+ "step": 302
+ },
+ {
+ "epoch": 2.6578947368421053,
+ "grad_norm": 0.7724822163581848,
+ "learning_rate": 3.651719808282573e-06,
+ "loss": 0.1063,
+ "step": 303
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 0.7474755644798279,
+ "learning_rate": 3.6397667337711475e-06,
+ "loss": 0.1034,
+ "step": 304
+ },
+ {
+ "epoch": 2.675438596491228,
+ "grad_norm": 0.5628909468650818,
+ "learning_rate": 3.6277806763488666e-06,
+ "loss": 0.1026,
+ "step": 305
+ },
+ {
+ "epoch": 2.6842105263157894,
+ "grad_norm": 0.9070547819137573,
+ "learning_rate": 3.6157619828718477e-06,
+ "loss": 0.1031,
+ "step": 306
+ },
+ {
+ "epoch": 2.692982456140351,
+ "grad_norm": 0.6968091130256653,
+ "learning_rate": 3.603711001140641e-06,
+ "loss": 0.1068,
+ "step": 307
+ },
+ {
+ "epoch": 2.7017543859649122,
+ "grad_norm": 0.3764977753162384,
+ "learning_rate": 3.5916280798901604e-06,
+ "loss": 0.1038,
+ "step": 308
+ },
+ {
+ "epoch": 2.7105263157894735,
+ "grad_norm": 5.012625694274902,
+ "learning_rate": 3.5795135687795984e-06,
+ "loss": 0.1129,
+ "step": 309
+ },
+ {
+ "epoch": 2.719298245614035,
+ "grad_norm": 0.6745572686195374,
+ "learning_rate": 3.567367818382303e-06,
+ "loss": 0.1071,
+ "step": 310
+ },
+ {
+ "epoch": 2.7280701754385968,
+ "grad_norm": 1.0659606456756592,
+ "learning_rate": 3.555191180175634e-06,
+ "loss": 0.1067,
+ "step": 311
+ },
+ {
+ "epoch": 2.736842105263158,
+ "grad_norm": 1.7312604188919067,
+ "learning_rate": 3.5429840065307924e-06,
+ "loss": 0.1101,
+ "step": 312
+ },
+ {
+ "epoch": 2.745614035087719,
+ "grad_norm": 1.100364327430725,
+ "learning_rate": 3.5307466507026223e-06,
+ "loss": 0.1098,
+ "step": 313
+ },
+ {
+ "epoch": 2.754385964912281,
+ "grad_norm": 1.0390428304672241,
+ "learning_rate": 3.5184794668193893e-06,
+ "loss": 0.1094,
+ "step": 314
+ },
+ {
+ "epoch": 2.763157894736842,
+ "grad_norm": 0.3369971811771393,
+ "learning_rate": 3.5061828098725327e-06,
+ "loss": 0.1053,
+ "step": 315
+ },
+ {
+ "epoch": 2.7719298245614032,
+ "grad_norm": 0.6130257248878479,
+ "learning_rate": 3.4938570357063906e-06,
+ "loss": 0.106,
+ "step": 316
+ },
+ {
+ "epoch": 2.780701754385965,
+ "grad_norm": 0.6387595534324646,
+ "learning_rate": 3.481502501007904e-06,
+ "loss": 0.1044,
+ "step": 317
+ },
+ {
+ "epoch": 2.7894736842105265,
+ "grad_norm": 1.0731587409973145,
+ "learning_rate": 3.469119563296296e-06,
+ "loss": 0.1097,
+ "step": 318
+ },
+ {
+ "epoch": 2.7982456140350878,
+ "grad_norm": 0.8096229434013367,
+ "learning_rate": 3.4567085809127247e-06,
+ "loss": 0.1076,
+ "step": 319
+ },
+ {
+ "epoch": 2.807017543859649,
+ "grad_norm": 0.5034844279289246,
+ "learning_rate": 3.444269913009912e-06,
+ "loss": 0.1071,
+ "step": 320
+ },
+ {
+ "epoch": 2.8157894736842106,
+ "grad_norm": 0.675139307975769,
+ "learning_rate": 3.4318039195417536e-06,
+ "loss": 0.1039,
+ "step": 321
+ },
+ {
+ "epoch": 2.824561403508772,
+ "grad_norm": 0.7330355644226074,
+ "learning_rate": 3.4193109612528972e-06,
+ "loss": 0.1044,
+ "step": 322
+ },
+ {
+ "epoch": 2.8333333333333335,
+ "grad_norm": 0.6558271646499634,
+ "learning_rate": 3.4067913996683115e-06,
+ "loss": 0.1051,
+ "step": 323
+ },
+ {
+ "epoch": 2.8421052631578947,
+ "grad_norm": 0.8411844372749329,
+ "learning_rate": 3.3942455970828146e-06,
+ "loss": 0.1063,
+ "step": 324
+ },
+ {
+ "epoch": 2.8508771929824563,
+ "grad_norm": 0.4817325174808502,
+ "learning_rate": 3.3816739165505964e-06,
+ "loss": 0.105,
+ "step": 325
+ },
+ {
+ "epoch": 2.8596491228070176,
+ "grad_norm": 0.424554705619812,
+ "learning_rate": 3.3690767218747104e-06,
+ "loss": 0.1037,
+ "step": 326
+ },
+ {
+ "epoch": 2.8684210526315788,
+ "grad_norm": 1.0054417848587036,
+ "learning_rate": 3.3564543775965475e-06,
+ "loss": 0.1058,
+ "step": 327
+ },
+ {
+ "epoch": 2.8771929824561404,
+ "grad_norm": 0.8984584808349609,
+ "learning_rate": 3.3438072489852837e-06,
+ "loss": 0.1079,
+ "step": 328
+ },
+ {
+ "epoch": 2.8859649122807016,
+ "grad_norm": 0.6779558062553406,
+ "learning_rate": 3.331135702027311e-06,
+ "loss": 0.1046,
+ "step": 329
+ },
+ {
+ "epoch": 2.8947368421052633,
+ "grad_norm": 0.6931657195091248,
+ "learning_rate": 3.318440103415649e-06,
+ "loss": 0.1106,
+ "step": 330
+ },
+ {
+ "epoch": 2.9035087719298245,
+ "grad_norm": 0.705264151096344,
+ "learning_rate": 3.305720820539329e-06,
+ "loss": 0.104,
+ "step": 331
+ },
+ {
+ "epoch": 2.912280701754386,
+ "grad_norm": 0.7799407839775085,
+ "learning_rate": 3.2929782214727657e-06,
+ "loss": 0.1019,
+ "step": 332
+ },
+ {
+ "epoch": 2.9210526315789473,
+ "grad_norm": 0.7583760619163513,
+ "learning_rate": 3.2802126749651042e-06,
+ "loss": 0.1049,
+ "step": 333
+ },
+ {
+ "epoch": 2.9298245614035086,
+ "grad_norm": 0.6145837306976318,
+ "learning_rate": 3.2674245504295505e-06,
+ "loss": 0.104,
+ "step": 334
+ },
+ {
+ "epoch": 2.93859649122807,
+ "grad_norm": 0.5170779228210449,
+ "learning_rate": 3.254614217932679e-06,
+ "loss": 0.1024,
+ "step": 335
+ },
+ {
+ "epoch": 2.9473684210526314,
+ "grad_norm": 0.6850940585136414,
+ "learning_rate": 3.241782048183726e-06,
+ "loss": 0.1047,
+ "step": 336
+ },
+ {
+ "epoch": 2.956140350877193,
+ "grad_norm": 0.7307694554328918,
+ "learning_rate": 3.2289284125238597e-06,
+ "loss": 0.1032,
+ "step": 337
+ },
+ {
+ "epoch": 2.9649122807017543,
+ "grad_norm": 0.3386179208755493,
+ "learning_rate": 3.216053682915436e-06,
+ "loss": 0.1037,
+ "step": 338
+ },
+ {
+ "epoch": 2.973684210526316,
+ "grad_norm": 0.7565059065818787,
+ "learning_rate": 3.203158231931234e-06,
+ "loss": 0.1048,
+ "step": 339
+ },
+ {
+ "epoch": 2.982456140350877,
+ "grad_norm": 0.7902039289474487,
+ "learning_rate": 3.190242432743673e-06,
+ "loss": 0.1068,
+ "step": 340
+ },
+ {
+ "epoch": 2.9912280701754383,
+ "grad_norm": 0.42595192790031433,
+ "learning_rate": 3.177306659114015e-06,
+ "loss": 0.1039,
+ "step": 341
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.1214542388916016,
+ "learning_rate": 3.164351285381549e-06,
+ "loss": 0.1062,
+ "step": 342
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 684,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 114,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.037997022244556e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-342/training_args.bin b/checkpoint-342/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf
--- /dev/null
+++ b/checkpoint-342/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae
+size 7992
diff --git a/checkpoint-342/zero_to_fp32.py b/checkpoint-342/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-342/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model``: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-456/README.md b/checkpoint-456/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615
--- /dev/null
+++ b/checkpoint-456/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-456/adapter_config.json b/checkpoint-456/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/checkpoint-456/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-456/adapter_model.safetensors b/checkpoint-456/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..42a164c19b8991d795745b2be7f51614f9e1c94c
--- /dev/null
+++ b/checkpoint-456/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dccdb7039918f5ffc78444d0e12eeaba609108a1ca06b93d76a6d876e6261bed
+size 9016826528
diff --git a/checkpoint-456/global_step456/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-456/global_step456/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..28c0b48334ed509e21364c83067d53c9aa7f48a3
--- /dev/null
+++ b/checkpoint-456/global_step456/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef6d7f29aa4de04e9a43a18bc91e2b805f98f74bd58a41ad903e02e7d0892d90
+size 27050164444
diff --git a/checkpoint-456/global_step456/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-456/global_step456/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..68ac964fceeee099e6785a218a177277061604a6
--- /dev/null
+++ b/checkpoint-456/global_step456/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da3b0512235d477b9bbfecacb366aafb4262703f193d00e52ab75dd8d8d57866
+size 27050169884
diff --git a/checkpoint-456/global_step456/mp_rank_00_model_states.pt b/checkpoint-456/global_step456/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ca0ec2806350c36f92e6af618cec77020e3329b3
--- /dev/null
+++ b/checkpoint-456/global_step456/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d25d7823729e2c703bfc8a32d1be4f363a6cf4448ba82fa6c7c3fed7bc41780
+size 9776788601
diff --git a/checkpoint-456/latest b/checkpoint-456/latest
new file mode 100644
index 0000000000000000000000000000000000000000..dbd5ff49aa710762c49b97ba3da2fe7861cf8ba3
--- /dev/null
+++ b/checkpoint-456/latest
@@ -0,0 +1 @@
+global_step456
\ No newline at end of file
diff --git a/checkpoint-456/rng_state_0.pth b/checkpoint-456/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ae0b1a9decf4ec9bb35071035fb26c2d4c93b67e
--- /dev/null
+++ b/checkpoint-456/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:558cb392136e9a34f3dc978e709dbf7e921016d196633280baa3af2f9b835feb
+size 14512
diff --git a/checkpoint-456/rng_state_1.pth b/checkpoint-456/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..47cbe05f6d15e006bbe6a3733bfe0cfdc100ba87
--- /dev/null
+++ b/checkpoint-456/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c56add9c7ad678528b4506397c292f971d6ba5e1526ee57775b0f10a018460b
+size 14512
diff --git a/checkpoint-456/scheduler.pt b/checkpoint-456/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9b1c0ffb3e608b05301b650729d1fd00684fe1c8
--- /dev/null
+++ b/checkpoint-456/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb3bad9ba2764b50552b04ba92b9b14a087b1672d360cfafb090b7313e46de9c
+size 1064
diff --git a/checkpoint-456/special_tokens_map.json b/checkpoint-456/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-456/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-456/tokenizer.json b/checkpoint-456/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-456/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-456/tokenizer_config.json b/checkpoint-456/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/checkpoint-456/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-456/trainer_state.json b/checkpoint-456/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..52bcc74e2542bd0952c8ea0398287f8bb21489d7
--- /dev/null
+++ b/checkpoint-456/trainer_state.json
@@ -0,0 +1,3225 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 456,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.008771929824561403,
+ "grad_norm": 39.56407165527344,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 5.1375,
+ "step": 1
+ },
+ {
+ "epoch": 0.017543859649122806,
+ "grad_norm": 40.30452346801758,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 5.1185,
+ "step": 2
+ },
+ {
+ "epoch": 0.02631578947368421,
+ "grad_norm": 40.062313079833984,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 5.0762,
+ "step": 3
+ },
+ {
+ "epoch": 0.03508771929824561,
+ "grad_norm": 39.17148208618164,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 5.016,
+ "step": 4
+ },
+ {
+ "epoch": 0.043859649122807015,
+ "grad_norm": 40.67367172241211,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 5.0428,
+ "step": 5
+ },
+ {
+ "epoch": 0.05263157894736842,
+ "grad_norm": 38.18095016479492,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 5.2025,
+ "step": 6
+ },
+ {
+ "epoch": 0.06140350877192982,
+ "grad_norm": 39.12940979003906,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 4.9896,
+ "step": 7
+ },
+ {
+ "epoch": 0.07017543859649122,
+ "grad_norm": 38.84568405151367,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 5.1078,
+ "step": 8
+ },
+ {
+ "epoch": 0.07894736842105263,
+ "grad_norm": 39.38333511352539,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 5.0808,
+ "step": 9
+ },
+ {
+ "epoch": 0.08771929824561403,
+ "grad_norm": 39.427650451660156,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 5.0534,
+ "step": 10
+ },
+ {
+ "epoch": 0.09649122807017543,
+ "grad_norm": 39.29513168334961,
+ "learning_rate": 5.5e-07,
+ "loss": 5.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.10526315789473684,
+ "grad_norm": 39.641231536865234,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 5.0317,
+ "step": 12
+ },
+ {
+ "epoch": 0.11403508771929824,
+ "grad_norm": 37.91259765625,
+ "learning_rate": 6.5e-07,
+ "loss": 4.912,
+ "step": 13
+ },
+ {
+ "epoch": 0.12280701754385964,
+ "grad_norm": 38.203548431396484,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 4.9705,
+ "step": 14
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 39.15998840332031,
+ "learning_rate": 7.5e-07,
+ "loss": 4.6962,
+ "step": 15
+ },
+ {
+ "epoch": 0.14035087719298245,
+ "grad_norm": 37.754669189453125,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 4.6262,
+ "step": 16
+ },
+ {
+ "epoch": 0.14912280701754385,
+ "grad_norm": 35.871490478515625,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 4.5422,
+ "step": 17
+ },
+ {
+ "epoch": 0.15789473684210525,
+ "grad_norm": 36.16888427734375,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 4.664,
+ "step": 18
+ },
+ {
+ "epoch": 0.16666666666666666,
+ "grad_norm": 33.520118713378906,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 4.4697,
+ "step": 19
+ },
+ {
+ "epoch": 0.17543859649122806,
+ "grad_norm": 30.896282196044922,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 4.3568,
+ "step": 20
+ },
+ {
+ "epoch": 0.18421052631578946,
+ "grad_norm": 29.944643020629883,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 4.2269,
+ "step": 21
+ },
+ {
+ "epoch": 0.19298245614035087,
+ "grad_norm": 25.224485397338867,
+ "learning_rate": 1.1e-06,
+ "loss": 4.1272,
+ "step": 22
+ },
+ {
+ "epoch": 0.20175438596491227,
+ "grad_norm": 24.410480499267578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 4.0585,
+ "step": 23
+ },
+ {
+ "epoch": 0.21052631578947367,
+ "grad_norm": 21.480648040771484,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 3.9472,
+ "step": 24
+ },
+ {
+ "epoch": 0.21929824561403508,
+ "grad_norm": 20.61946678161621,
+ "learning_rate": 1.25e-06,
+ "loss": 3.8879,
+ "step": 25
+ },
+ {
+ "epoch": 0.22807017543859648,
+ "grad_norm": 19.578271865844727,
+ "learning_rate": 1.3e-06,
+ "loss": 3.6783,
+ "step": 26
+ },
+ {
+ "epoch": 0.23684210526315788,
+ "grad_norm": 17.418983459472656,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 3.6826,
+ "step": 27
+ },
+ {
+ "epoch": 0.24561403508771928,
+ "grad_norm": 18.160301208496094,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 3.478,
+ "step": 28
+ },
+ {
+ "epoch": 0.2543859649122807,
+ "grad_norm": 17.573204040527344,
+ "learning_rate": 1.45e-06,
+ "loss": 3.459,
+ "step": 29
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 17.1265869140625,
+ "learning_rate": 1.5e-06,
+ "loss": 3.3999,
+ "step": 30
+ },
+ {
+ "epoch": 0.2719298245614035,
+ "grad_norm": 15.527145385742188,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 3.2817,
+ "step": 31
+ },
+ {
+ "epoch": 0.2807017543859649,
+ "grad_norm": 14.773847579956055,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 3.234,
+ "step": 32
+ },
+ {
+ "epoch": 0.2894736842105263,
+ "grad_norm": 12.039301872253418,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 3.132,
+ "step": 33
+ },
+ {
+ "epoch": 0.2982456140350877,
+ "grad_norm": 9.217979431152344,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 3.0548,
+ "step": 34
+ },
+ {
+ "epoch": 0.30701754385964913,
+ "grad_norm": 7.575639724731445,
+ "learning_rate": 1.75e-06,
+ "loss": 2.9529,
+ "step": 35
+ },
+ {
+ "epoch": 0.3157894736842105,
+ "grad_norm": 7.496004104614258,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 2.8967,
+ "step": 36
+ },
+ {
+ "epoch": 0.32456140350877194,
+ "grad_norm": 7.45414924621582,
+ "learning_rate": 1.85e-06,
+ "loss": 2.8837,
+ "step": 37
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 8.555658340454102,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 2.7473,
+ "step": 38
+ },
+ {
+ "epoch": 0.34210526315789475,
+ "grad_norm": 10.03805160522461,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 2.7355,
+ "step": 39
+ },
+ {
+ "epoch": 0.3508771929824561,
+ "grad_norm": 9.30649471282959,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 2.6587,
+ "step": 40
+ },
+ {
+ "epoch": 0.35964912280701755,
+ "grad_norm": 8.510339736938477,
+ "learning_rate": 2.05e-06,
+ "loss": 2.5977,
+ "step": 41
+ },
+ {
+ "epoch": 0.3684210526315789,
+ "grad_norm": 4.709080696105957,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 2.6286,
+ "step": 42
+ },
+ {
+ "epoch": 0.37719298245614036,
+ "grad_norm": 5.128961086273193,
+ "learning_rate": 2.15e-06,
+ "loss": 2.4558,
+ "step": 43
+ },
+ {
+ "epoch": 0.38596491228070173,
+ "grad_norm": 5.190136432647705,
+ "learning_rate": 2.2e-06,
+ "loss": 2.4432,
+ "step": 44
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 4.893551349639893,
+ "learning_rate": 2.25e-06,
+ "loss": 2.4939,
+ "step": 45
+ },
+ {
+ "epoch": 0.40350877192982454,
+ "grad_norm": 5.2434983253479,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 2.3381,
+ "step": 46
+ },
+ {
+ "epoch": 0.41228070175438597,
+ "grad_norm": 5.122412204742432,
+ "learning_rate": 2.35e-06,
+ "loss": 2.313,
+ "step": 47
+ },
+ {
+ "epoch": 0.42105263157894735,
+ "grad_norm": 4.577274799346924,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 2.2236,
+ "step": 48
+ },
+ {
+ "epoch": 0.4298245614035088,
+ "grad_norm": 4.722769737243652,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 2.1987,
+ "step": 49
+ },
+ {
+ "epoch": 0.43859649122807015,
+ "grad_norm": 5.059235095977783,
+ "learning_rate": 2.5e-06,
+ "loss": 2.1415,
+ "step": 50
+ },
+ {
+ "epoch": 0.4473684210526316,
+ "grad_norm": 4.454439640045166,
+ "learning_rate": 2.55e-06,
+ "loss": 2.0466,
+ "step": 51
+ },
+ {
+ "epoch": 0.45614035087719296,
+ "grad_norm": 4.94586706161499,
+ "learning_rate": 2.6e-06,
+ "loss": 1.8762,
+ "step": 52
+ },
+ {
+ "epoch": 0.4649122807017544,
+ "grad_norm": 4.704402446746826,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.8012,
+ "step": 53
+ },
+ {
+ "epoch": 0.47368421052631576,
+ "grad_norm": 6.125903129577637,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 1.7669,
+ "step": 54
+ },
+ {
+ "epoch": 0.4824561403508772,
+ "grad_norm": 4.5356059074401855,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.6607,
+ "step": 55
+ },
+ {
+ "epoch": 0.49122807017543857,
+ "grad_norm": 6.56803035736084,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 1.6291,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 4.910050392150879,
+ "learning_rate": 2.85e-06,
+ "loss": 1.5545,
+ "step": 57
+ },
+ {
+ "epoch": 0.5087719298245614,
+ "grad_norm": 8.733433723449707,
+ "learning_rate": 2.9e-06,
+ "loss": 1.4206,
+ "step": 58
+ },
+ {
+ "epoch": 0.5175438596491229,
+ "grad_norm": 8.582486152648926,
+ "learning_rate": 2.95e-06,
+ "loss": 1.3912,
+ "step": 59
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 13.710689544677734,
+ "learning_rate": 3e-06,
+ "loss": 1.3297,
+ "step": 60
+ },
+ {
+ "epoch": 0.5350877192982456,
+ "grad_norm": 23.400312423706055,
+ "learning_rate": 3.05e-06,
+ "loss": 1.296,
+ "step": 61
+ },
+ {
+ "epoch": 0.543859649122807,
+ "grad_norm": 5.678805351257324,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.2259,
+ "step": 62
+ },
+ {
+ "epoch": 0.5526315789473685,
+ "grad_norm": 14.700899124145508,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 1.1087,
+ "step": 63
+ },
+ {
+ "epoch": 0.5614035087719298,
+ "grad_norm": 19.38919448852539,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 1.1805,
+ "step": 64
+ },
+ {
+ "epoch": 0.5701754385964912,
+ "grad_norm": 8.460039138793945,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 1.0963,
+ "step": 65
+ },
+ {
+ "epoch": 0.5789473684210527,
+ "grad_norm": 13.371014595031738,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 1.0627,
+ "step": 66
+ },
+ {
+ "epoch": 0.5877192982456141,
+ "grad_norm": 22.380569458007812,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0869,
+ "step": 67
+ },
+ {
+ "epoch": 0.5964912280701754,
+ "grad_norm": 5.780513286590576,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9991,
+ "step": 68
+ },
+ {
+ "epoch": 0.6052631578947368,
+ "grad_norm": 19.850841522216797,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9683,
+ "step": 69
+ },
+ {
+ "epoch": 0.6140350877192983,
+ "grad_norm": 17.160703659057617,
+ "learning_rate": 3.5e-06,
+ "loss": 0.845,
+ "step": 70
+ },
+ {
+ "epoch": 0.6228070175438597,
+ "grad_norm": 14.264311790466309,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8059,
+ "step": 71
+ },
+ {
+ "epoch": 0.631578947368421,
+ "grad_norm": 26.39459991455078,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.85,
+ "step": 72
+ },
+ {
+ "epoch": 0.6403508771929824,
+ "grad_norm": 51.10348892211914,
+ "learning_rate": 3.65e-06,
+ "loss": 0.9755,
+ "step": 73
+ },
+ {
+ "epoch": 0.6491228070175439,
+ "grad_norm": 28.795856475830078,
+ "learning_rate": 3.7e-06,
+ "loss": 0.8966,
+ "step": 74
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.6617937088012695,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7716,
+ "step": 75
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.729666709899902,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.7578,
+ "step": 76
+ },
+ {
+ "epoch": 0.6754385964912281,
+ "grad_norm": 7.109970569610596,
+ "learning_rate": 3.85e-06,
+ "loss": 0.7055,
+ "step": 77
+ },
+ {
+ "epoch": 0.6842105263157895,
+ "grad_norm": 20.84659194946289,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.7458,
+ "step": 78
+ },
+ {
+ "epoch": 0.6929824561403509,
+ "grad_norm": 21.601303100585938,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6879,
+ "step": 79
+ },
+ {
+ "epoch": 0.7017543859649122,
+ "grad_norm": 3.6914751529693604,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6179,
+ "step": 80
+ },
+ {
+ "epoch": 0.7105263157894737,
+ "grad_norm": 16.539325714111328,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5716,
+ "step": 81
+ },
+ {
+ "epoch": 0.7192982456140351,
+ "grad_norm": 13.931925773620605,
+ "learning_rate": 4.1e-06,
+ "loss": 0.558,
+ "step": 82
+ },
+ {
+ "epoch": 0.7280701754385965,
+ "grad_norm": 10.52951717376709,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6018,
+ "step": 83
+ },
+ {
+ "epoch": 0.7368421052631579,
+ "grad_norm": 17.337060928344727,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5501,
+ "step": 84
+ },
+ {
+ "epoch": 0.7456140350877193,
+ "grad_norm": 13.500468254089355,
+ "learning_rate": 4.25e-06,
+ "loss": 0.5214,
+ "step": 85
+ },
+ {
+ "epoch": 0.7543859649122807,
+ "grad_norm": 10.290645599365234,
+ "learning_rate": 4.3e-06,
+ "loss": 0.4996,
+ "step": 86
+ },
+ {
+ "epoch": 0.7631578947368421,
+ "grad_norm": 9.757556915283203,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.498,
+ "step": 87
+ },
+ {
+ "epoch": 0.7719298245614035,
+ "grad_norm": 9.325140953063965,
+ "learning_rate": 4.4e-06,
+ "loss": 0.4721,
+ "step": 88
+ },
+ {
+ "epoch": 0.7807017543859649,
+ "grad_norm": 2.9322128295898438,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.4528,
+ "step": 89
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 10.484073638916016,
+ "learning_rate": 4.5e-06,
+ "loss": 0.445,
+ "step": 90
+ },
+ {
+ "epoch": 0.7982456140350878,
+ "grad_norm": 32.7827262878418,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5105,
+ "step": 91
+ },
+ {
+ "epoch": 0.8070175438596491,
+ "grad_norm": 2.8477306365966797,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.4117,
+ "step": 92
+ },
+ {
+ "epoch": 0.8157894736842105,
+ "grad_norm": 2.7680225372314453,
+ "learning_rate": 4.65e-06,
+ "loss": 0.3653,
+ "step": 93
+ },
+ {
+ "epoch": 0.8245614035087719,
+ "grad_norm": 2.6512742042541504,
+ "learning_rate": 4.7e-06,
+ "loss": 0.3878,
+ "step": 94
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 6.453914165496826,
+ "learning_rate": 4.75e-06,
+ "loss": 0.3611,
+ "step": 95
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.4594080448150635,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.3817,
+ "step": 96
+ },
+ {
+ "epoch": 0.8508771929824561,
+ "grad_norm": 3.6144917011260986,
+ "learning_rate": 4.85e-06,
+ "loss": 0.3618,
+ "step": 97
+ },
+ {
+ "epoch": 0.8596491228070176,
+ "grad_norm": 5.349407196044922,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.3218,
+ "step": 98
+ },
+ {
+ "epoch": 0.868421052631579,
+ "grad_norm": 13.671236991882324,
+ "learning_rate": 4.95e-06,
+ "loss": 0.3329,
+ "step": 99
+ },
+ {
+ "epoch": 0.8771929824561403,
+ "grad_norm": 5.84046745300293,
+ "learning_rate": 5e-06,
+ "loss": 0.2967,
+ "step": 100
+ },
+ {
+ "epoch": 0.8859649122807017,
+ "grad_norm": 14.005338668823242,
+ "learning_rate": 4.999963827125897e-06,
+ "loss": 0.303,
+ "step": 101
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 9.18114185333252,
+ "learning_rate": 4.999855309550366e-06,
+ "loss": 0.2762,
+ "step": 102
+ },
+ {
+ "epoch": 0.9035087719298246,
+ "grad_norm": 3.0800487995147705,
+ "learning_rate": 4.999674450413725e-06,
+ "loss": 0.2628,
+ "step": 103
+ },
+ {
+ "epoch": 0.9122807017543859,
+ "grad_norm": 82.03578186035156,
+ "learning_rate": 4.999421254949728e-06,
+ "loss": 0.4065,
+ "step": 104
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 77.66315460205078,
+ "learning_rate": 4.99909573048542e-06,
+ "loss": 0.4307,
+ "step": 105
+ },
+ {
+ "epoch": 0.9298245614035088,
+ "grad_norm": 18.28767967224121,
+ "learning_rate": 4.998697886440927e-06,
+ "loss": 0.2571,
+ "step": 106
+ },
+ {
+ "epoch": 0.9385964912280702,
+ "grad_norm": 5.960445880889893,
+ "learning_rate": 4.998227734329177e-06,
+ "loss": 0.2847,
+ "step": 107
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 5.437699794769287,
+ "learning_rate": 4.9976852877555755e-06,
+ "loss": 0.2728,
+ "step": 108
+ },
+ {
+ "epoch": 0.956140350877193,
+ "grad_norm": 3.379631280899048,
+ "learning_rate": 4.997070562417602e-06,
+ "loss": 0.2467,
+ "step": 109
+ },
+ {
+ "epoch": 0.9649122807017544,
+ "grad_norm": 3.1625075340270996,
+ "learning_rate": 4.996383576104362e-06,
+ "loss": 0.2273,
+ "step": 110
+ },
+ {
+ "epoch": 0.9736842105263158,
+ "grad_norm": 15.588600158691406,
+ "learning_rate": 4.995624348696071e-06,
+ "loss": 0.2486,
+ "step": 111
+ },
+ {
+ "epoch": 0.9824561403508771,
+ "grad_norm": 2.631044387817383,
+ "learning_rate": 4.9947929021634815e-06,
+ "loss": 0.1964,
+ "step": 112
+ },
+ {
+ "epoch": 0.9912280701754386,
+ "grad_norm": 4.706504821777344,
+ "learning_rate": 4.993889260567239e-06,
+ "loss": 0.1901,
+ "step": 113
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 10.368465423583984,
+ "learning_rate": 4.9929134500571954e-06,
+ "loss": 0.1996,
+ "step": 114
+ },
+ {
+ "epoch": 1.0087719298245614,
+ "grad_norm": 30.44986343383789,
+ "learning_rate": 4.991865498871647e-06,
+ "loss": 0.2606,
+ "step": 115
+ },
+ {
+ "epoch": 1.0175438596491229,
+ "grad_norm": 14.421515464782715,
+ "learning_rate": 4.99074543733652e-06,
+ "loss": 0.2394,
+ "step": 116
+ },
+ {
+ "epoch": 1.0263157894736843,
+ "grad_norm": 14.072005271911621,
+ "learning_rate": 4.989553297864489e-06,
+ "loss": 0.2288,
+ "step": 117
+ },
+ {
+ "epoch": 1.0350877192982457,
+ "grad_norm": 4.395325660705566,
+ "learning_rate": 4.988289114954045e-06,
+ "loss": 0.2129,
+ "step": 118
+ },
+ {
+ "epoch": 1.043859649122807,
+ "grad_norm": 7.286703586578369,
+ "learning_rate": 4.986952925188489e-06,
+ "loss": 0.186,
+ "step": 119
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 8.332784652709961,
+ "learning_rate": 4.98554476723488e-06,
+ "loss": 0.178,
+ "step": 120
+ },
+ {
+ "epoch": 1.0614035087719298,
+ "grad_norm": 1.3646447658538818,
+ "learning_rate": 4.984064681842917e-06,
+ "loss": 0.1687,
+ "step": 121
+ },
+ {
+ "epoch": 1.0701754385964912,
+ "grad_norm": 4.494940757751465,
+ "learning_rate": 4.982512711843753e-06,
+ "loss": 0.1881,
+ "step": 122
+ },
+ {
+ "epoch": 1.0789473684210527,
+ "grad_norm": 3.3929836750030518,
+ "learning_rate": 4.980888902148757e-06,
+ "loss": 0.1764,
+ "step": 123
+ },
+ {
+ "epoch": 1.087719298245614,
+ "grad_norm": 1.8281155824661255,
+ "learning_rate": 4.979193299748225e-06,
+ "loss": 0.1602,
+ "step": 124
+ },
+ {
+ "epoch": 1.0964912280701755,
+ "grad_norm": 3.494239568710327,
+ "learning_rate": 4.977425953710005e-06,
+ "loss": 0.1729,
+ "step": 125
+ },
+ {
+ "epoch": 1.1052631578947367,
+ "grad_norm": 1.500410556793213,
+ "learning_rate": 4.975586915178084e-06,
+ "loss": 0.1666,
+ "step": 126
+ },
+ {
+ "epoch": 1.1140350877192982,
+ "grad_norm": 1.4680222272872925,
+ "learning_rate": 4.973676237371111e-06,
+ "loss": 0.159,
+ "step": 127
+ },
+ {
+ "epoch": 1.1228070175438596,
+ "grad_norm": 3.0383460521698,
+ "learning_rate": 4.971693975580851e-06,
+ "loss": 0.1484,
+ "step": 128
+ },
+ {
+ "epoch": 1.131578947368421,
+ "grad_norm": 3.74821138381958,
+ "learning_rate": 4.969640187170591e-06,
+ "loss": 0.1586,
+ "step": 129
+ },
+ {
+ "epoch": 1.1403508771929824,
+ "grad_norm": 4.682602405548096,
+ "learning_rate": 4.967514931573473e-06,
+ "loss": 0.1619,
+ "step": 130
+ },
+ {
+ "epoch": 1.1491228070175439,
+ "grad_norm": 3.90673565864563,
+ "learning_rate": 4.965318270290779e-06,
+ "loss": 0.164,
+ "step": 131
+ },
+ {
+ "epoch": 1.1578947368421053,
+ "grad_norm": 2.2017388343811035,
+ "learning_rate": 4.963050266890152e-06,
+ "loss": 0.1499,
+ "step": 132
+ },
+ {
+ "epoch": 1.1666666666666667,
+ "grad_norm": 2.4211816787719727,
+ "learning_rate": 4.960710987003753e-06,
+ "loss": 0.1387,
+ "step": 133
+ },
+ {
+ "epoch": 1.1754385964912282,
+ "grad_norm": 1.7753759622573853,
+ "learning_rate": 4.958300498326363e-06,
+ "loss": 0.1441,
+ "step": 134
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 1.5529910326004028,
+ "learning_rate": 4.955818870613425e-06,
+ "loss": 0.1304,
+ "step": 135
+ },
+ {
+ "epoch": 1.1929824561403508,
+ "grad_norm": 2.090593099594116,
+ "learning_rate": 4.953266175679023e-06,
+ "loss": 0.1419,
+ "step": 136
+ },
+ {
+ "epoch": 1.2017543859649122,
+ "grad_norm": 2.7141878604888916,
+ "learning_rate": 4.95064248739381e-06,
+ "loss": 0.1444,
+ "step": 137
+ },
+ {
+ "epoch": 1.2105263157894737,
+ "grad_norm": 2.3690481185913086,
+ "learning_rate": 4.947947881682861e-06,
+ "loss": 0.1383,
+ "step": 138
+ },
+ {
+ "epoch": 1.219298245614035,
+ "grad_norm": 2.2403147220611572,
+ "learning_rate": 4.945182436523482e-06,
+ "loss": 0.1418,
+ "step": 139
+ },
+ {
+ "epoch": 1.2280701754385965,
+ "grad_norm": 1.3939160108566284,
+ "learning_rate": 4.942346231942955e-06,
+ "loss": 0.1307,
+ "step": 140
+ },
+ {
+ "epoch": 1.236842105263158,
+ "grad_norm": 11.276732444763184,
+ "learning_rate": 4.939439350016214e-06,
+ "loss": 0.1397,
+ "step": 141
+ },
+ {
+ "epoch": 1.2456140350877192,
+ "grad_norm": 8.260516166687012,
+ "learning_rate": 4.9364618748634794e-06,
+ "loss": 0.1426,
+ "step": 142
+ },
+ {
+ "epoch": 1.2543859649122808,
+ "grad_norm": 2.09720516204834,
+ "learning_rate": 4.933413892647819e-06,
+ "loss": 0.1323,
+ "step": 143
+ },
+ {
+ "epoch": 1.263157894736842,
+ "grad_norm": 1.802125334739685,
+ "learning_rate": 4.9302954915726535e-06,
+ "loss": 0.1304,
+ "step": 144
+ },
+ {
+ "epoch": 1.2719298245614035,
+ "grad_norm": 1.7151471376419067,
+ "learning_rate": 4.927106761879207e-06,
+ "loss": 0.1264,
+ "step": 145
+ },
+ {
+ "epoch": 1.280701754385965,
+ "grad_norm": 1.6970336437225342,
+ "learning_rate": 4.923847795843894e-06,
+ "loss": 0.1227,
+ "step": 146
+ },
+ {
+ "epoch": 1.2894736842105263,
+ "grad_norm": 16.60441017150879,
+ "learning_rate": 4.920518687775647e-06,
+ "loss": 0.1606,
+ "step": 147
+ },
+ {
+ "epoch": 1.2982456140350878,
+ "grad_norm": 6.470354080200195,
+ "learning_rate": 4.917119534013194e-06,
+ "loss": 0.1447,
+ "step": 148
+ },
+ {
+ "epoch": 1.3070175438596492,
+ "grad_norm": 1.4908231496810913,
+ "learning_rate": 4.913650432922264e-06,
+ "loss": 0.1343,
+ "step": 149
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.19964861869812,
+ "learning_rate": 4.91011148489274e-06,
+ "loss": 0.1354,
+ "step": 150
+ },
+ {
+ "epoch": 1.3245614035087718,
+ "grad_norm": 2.6052839756011963,
+ "learning_rate": 4.906502792335761e-06,
+ "loss": 0.1342,
+ "step": 151
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.0719165802001953,
+ "learning_rate": 4.9028244596807525e-06,
+ "loss": 0.1359,
+ "step": 152
+ },
+ {
+ "epoch": 1.3421052631578947,
+ "grad_norm": 0.8086919784545898,
+ "learning_rate": 4.899076593372405e-06,
+ "loss": 0.1279,
+ "step": 153
+ },
+ {
+ "epoch": 1.3508771929824561,
+ "grad_norm": 1.0056848526000977,
+ "learning_rate": 4.8952593018675955e-06,
+ "loss": 0.1162,
+ "step": 154
+ },
+ {
+ "epoch": 1.3596491228070176,
+ "grad_norm": 5.72553014755249,
+ "learning_rate": 4.891372695632249e-06,
+ "loss": 0.1315,
+ "step": 155
+ },
+ {
+ "epoch": 1.368421052631579,
+ "grad_norm": 1.522894024848938,
+ "learning_rate": 4.887416887138139e-06,
+ "loss": 0.1266,
+ "step": 156
+ },
+ {
+ "epoch": 1.3771929824561404,
+ "grad_norm": 2.019472122192383,
+ "learning_rate": 4.883391990859635e-06,
+ "loss": 0.1262,
+ "step": 157
+ },
+ {
+ "epoch": 1.3859649122807016,
+ "grad_norm": 1.8594422340393066,
+ "learning_rate": 4.879298123270391e-06,
+ "loss": 0.125,
+ "step": 158
+ },
+ {
+ "epoch": 1.3947368421052633,
+ "grad_norm": 1.365377426147461,
+ "learning_rate": 4.8751354028399725e-06,
+ "loss": 0.1218,
+ "step": 159
+ },
+ {
+ "epoch": 1.4035087719298245,
+ "grad_norm": 3.553309917449951,
+ "learning_rate": 4.870903950030429e-06,
+ "loss": 0.1272,
+ "step": 160
+ },
+ {
+ "epoch": 1.412280701754386,
+ "grad_norm": 2.1770920753479004,
+ "learning_rate": 4.866603887292809e-06,
+ "loss": 0.1213,
+ "step": 161
+ },
+ {
+ "epoch": 1.4210526315789473,
+ "grad_norm": 1.6058955192565918,
+ "learning_rate": 4.862235339063613e-06,
+ "loss": 0.1173,
+ "step": 162
+ },
+ {
+ "epoch": 1.4298245614035088,
+ "grad_norm": 1.3208314180374146,
+ "learning_rate": 4.857798431761199e-06,
+ "loss": 0.1183,
+ "step": 163
+ },
+ {
+ "epoch": 1.4385964912280702,
+ "grad_norm": 1.282729983329773,
+ "learning_rate": 4.853293293782118e-06,
+ "loss": 0.1209,
+ "step": 164
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 1.3838152885437012,
+ "learning_rate": 4.848720055497401e-06,
+ "loss": 0.1198,
+ "step": 165
+ },
+ {
+ "epoch": 1.456140350877193,
+ "grad_norm": 1.2930737733840942,
+ "learning_rate": 4.844078849248785e-06,
+ "loss": 0.1268,
+ "step": 166
+ },
+ {
+ "epoch": 1.4649122807017543,
+ "grad_norm": 1.7022266387939453,
+ "learning_rate": 4.839369809344888e-06,
+ "loss": 0.1198,
+ "step": 167
+ },
+ {
+ "epoch": 1.4736842105263157,
+ "grad_norm": 1.0927815437316895,
+ "learning_rate": 4.834593072057313e-06,
+ "loss": 0.1132,
+ "step": 168
+ },
+ {
+ "epoch": 1.4824561403508771,
+ "grad_norm": 0.9326333999633789,
+ "learning_rate": 4.829748775616716e-06,
+ "loss": 0.1193,
+ "step": 169
+ },
+ {
+ "epoch": 1.4912280701754386,
+ "grad_norm": 1.3564742803573608,
+ "learning_rate": 4.8248370602087954e-06,
+ "loss": 0.118,
+ "step": 170
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.19778573513031,
+ "learning_rate": 4.819858067970243e-06,
+ "loss": 0.1122,
+ "step": 171
+ },
+ {
+ "epoch": 1.5087719298245614,
+ "grad_norm": 2.8438351154327393,
+ "learning_rate": 4.814811942984625e-06,
+ "loss": 0.1217,
+ "step": 172
+ },
+ {
+ "epoch": 1.5175438596491229,
+ "grad_norm": 1.0701063871383667,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 0.1114,
+ "step": 173
+ },
+ {
+ "epoch": 1.526315789473684,
+ "grad_norm": 0.9053553938865662,
+ "learning_rate": 4.804518880815776e-06,
+ "loss": 0.1178,
+ "step": 174
+ },
+ {
+ "epoch": 1.5350877192982457,
+ "grad_norm": 0.42274603247642517,
+ "learning_rate": 4.799272241496259e-06,
+ "loss": 0.1091,
+ "step": 175
+ },
+ {
+ "epoch": 1.543859649122807,
+ "grad_norm": 0.8576470017433167,
+ "learning_rate": 4.793959065148484e-06,
+ "loss": 0.1134,
+ "step": 176
+ },
+ {
+ "epoch": 1.5526315789473686,
+ "grad_norm": 0.5910662412643433,
+ "learning_rate": 4.78857950552674e-06,
+ "loss": 0.1148,
+ "step": 177
+ },
+ {
+ "epoch": 1.5614035087719298,
+ "grad_norm": 0.8761632442474365,
+ "learning_rate": 4.783133718306331e-06,
+ "loss": 0.1125,
+ "step": 178
+ },
+ {
+ "epoch": 1.5701754385964912,
+ "grad_norm": 1.9190795421600342,
+ "learning_rate": 4.777621861079079e-06,
+ "loss": 0.1148,
+ "step": 179
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 0.6199957728385925,
+ "learning_rate": 4.772044093348757e-06,
+ "loss": 0.1097,
+ "step": 180
+ },
+ {
+ "epoch": 1.587719298245614,
+ "grad_norm": 1.562089443206787,
+ "learning_rate": 4.766400576526479e-06,
+ "loss": 0.1097,
+ "step": 181
+ },
+ {
+ "epoch": 1.5964912280701755,
+ "grad_norm": 1.4957091808319092,
+ "learning_rate": 4.760691473926021e-06,
+ "loss": 0.1216,
+ "step": 182
+ },
+ {
+ "epoch": 1.6052631578947367,
+ "grad_norm": 0.9863570332527161,
+ "learning_rate": 4.754916950759105e-06,
+ "loss": 0.1122,
+ "step": 183
+ },
+ {
+ "epoch": 1.6140350877192984,
+ "grad_norm": 0.5803346633911133,
+ "learning_rate": 4.749077174130609e-06,
+ "loss": 0.1103,
+ "step": 184
+ },
+ {
+ "epoch": 1.6228070175438596,
+ "grad_norm": 1.8789891004562378,
+ "learning_rate": 4.743172313033738e-06,
+ "loss": 0.1191,
+ "step": 185
+ },
+ {
+ "epoch": 1.631578947368421,
+ "grad_norm": 0.8731380105018616,
+ "learning_rate": 4.7372025383451285e-06,
+ "loss": 0.1154,
+ "step": 186
+ },
+ {
+ "epoch": 1.6403508771929824,
+ "grad_norm": 1.3535627126693726,
+ "learning_rate": 4.7311680228199075e-06,
+ "loss": 0.1123,
+ "step": 187
+ },
+ {
+ "epoch": 1.6491228070175439,
+ "grad_norm": 0.7211089134216309,
+ "learning_rate": 4.725068941086693e-06,
+ "loss": 0.1134,
+ "step": 188
+ },
+ {
+ "epoch": 1.6578947368421053,
+ "grad_norm": 1.4752328395843506,
+ "learning_rate": 4.718905469642534e-06,
+ "loss": 0.1185,
+ "step": 189
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 0.9822680354118347,
+ "learning_rate": 4.712677786847814e-06,
+ "loss": 0.1146,
+ "step": 190
+ },
+ {
+ "epoch": 1.6754385964912282,
+ "grad_norm": 1.1308330297470093,
+ "learning_rate": 4.706386072921083e-06,
+ "loss": 0.1061,
+ "step": 191
+ },
+ {
+ "epoch": 1.6842105263157894,
+ "grad_norm": 5.331939697265625,
+ "learning_rate": 4.70003050993384e-06,
+ "loss": 0.1153,
+ "step": 192
+ },
+ {
+ "epoch": 1.692982456140351,
+ "grad_norm": 0.6911673545837402,
+ "learning_rate": 4.6936112818052674e-06,
+ "loss": 0.1098,
+ "step": 193
+ },
+ {
+ "epoch": 1.7017543859649122,
+ "grad_norm": 0.5160980224609375,
+ "learning_rate": 4.687128574296912e-06,
+ "loss": 0.1073,
+ "step": 194
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 1.5724798440933228,
+ "learning_rate": 4.680582575007303e-06,
+ "loss": 0.121,
+ "step": 195
+ },
+ {
+ "epoch": 1.719298245614035,
+ "grad_norm": 1.3960011005401611,
+ "learning_rate": 4.6739734733665275e-06,
+ "loss": 0.1145,
+ "step": 196
+ },
+ {
+ "epoch": 1.7280701754385965,
+ "grad_norm": 1.4949183464050293,
+ "learning_rate": 4.6673014606307465e-06,
+ "loss": 0.1166,
+ "step": 197
+ },
+ {
+ "epoch": 1.736842105263158,
+ "grad_norm": 1.6873422861099243,
+ "learning_rate": 4.660566729876661e-06,
+ "loss": 0.1115,
+ "step": 198
+ },
+ {
+ "epoch": 1.7456140350877192,
+ "grad_norm": 1.3443641662597656,
+ "learning_rate": 4.653769475995926e-06,
+ "loss": 0.1119,
+ "step": 199
+ },
+ {
+ "epoch": 1.7543859649122808,
+ "grad_norm": 0.807525098323822,
+ "learning_rate": 4.646909895689508e-06,
+ "loss": 0.1059,
+ "step": 200
+ },
+ {
+ "epoch": 1.763157894736842,
+ "grad_norm": 1.589316964149475,
+ "learning_rate": 4.639988187461995e-06,
+ "loss": 0.1151,
+ "step": 201
+ },
+ {
+ "epoch": 1.7719298245614035,
+ "grad_norm": 2.474756956100464,
+ "learning_rate": 4.633004551615851e-06,
+ "loss": 0.116,
+ "step": 202
+ },
+ {
+ "epoch": 1.780701754385965,
+ "grad_norm": 0.6210195422172546,
+ "learning_rate": 4.62595919024562e-06,
+ "loss": 0.1097,
+ "step": 203
+ },
+ {
+ "epoch": 1.7894736842105263,
+ "grad_norm": 0.7217905521392822,
+ "learning_rate": 4.618852307232078e-06,
+ "loss": 0.1117,
+ "step": 204
+ },
+ {
+ "epoch": 1.7982456140350878,
+ "grad_norm": 1.551251769065857,
+ "learning_rate": 4.611684108236334e-06,
+ "loss": 0.113,
+ "step": 205
+ },
+ {
+ "epoch": 1.807017543859649,
+ "grad_norm": 0.6619828939437866,
+ "learning_rate": 4.604454800693874e-06,
+ "loss": 0.113,
+ "step": 206
+ },
+ {
+ "epoch": 1.8157894736842106,
+ "grad_norm": 0.9461805820465088,
+ "learning_rate": 4.597164593808564e-06,
+ "loss": 0.1093,
+ "step": 207
+ },
+ {
+ "epoch": 1.8245614035087718,
+ "grad_norm": 1.2926547527313232,
+ "learning_rate": 4.589813698546592e-06,
+ "loss": 0.1128,
+ "step": 208
+ },
+ {
+ "epoch": 1.8333333333333335,
+ "grad_norm": 0.8754212856292725,
+ "learning_rate": 4.582402327630368e-06,
+ "loss": 0.1104,
+ "step": 209
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 0.846051812171936,
+ "learning_rate": 4.574930695532357e-06,
+ "loss": 0.1105,
+ "step": 210
+ },
+ {
+ "epoch": 1.8508771929824561,
+ "grad_norm": 1.3332515954971313,
+ "learning_rate": 4.567399018468889e-06,
+ "loss": 0.1101,
+ "step": 211
+ },
+ {
+ "epoch": 1.8596491228070176,
+ "grad_norm": 0.8729192614555359,
+ "learning_rate": 4.5598075143938855e-06,
+ "loss": 0.1081,
+ "step": 212
+ },
+ {
+ "epoch": 1.868421052631579,
+ "grad_norm": 0.8618345260620117,
+ "learning_rate": 4.552156402992567e-06,
+ "loss": 0.1059,
+ "step": 213
+ },
+ {
+ "epoch": 1.8771929824561404,
+ "grad_norm": 1.2135930061340332,
+ "learning_rate": 4.544445905675082e-06,
+ "loss": 0.1105,
+ "step": 214
+ },
+ {
+ "epoch": 1.8859649122807016,
+ "grad_norm": 0.8405666351318359,
+ "learning_rate": 4.536676245570111e-06,
+ "loss": 0.1118,
+ "step": 215
+ },
+ {
+ "epoch": 1.8947368421052633,
+ "grad_norm": 0.42860639095306396,
+ "learning_rate": 4.528847647518403e-06,
+ "loss": 0.1093,
+ "step": 216
+ },
+ {
+ "epoch": 1.9035087719298245,
+ "grad_norm": 1.1538206338882446,
+ "learning_rate": 4.520960338066271e-06,
+ "loss": 0.1088,
+ "step": 217
+ },
+ {
+ "epoch": 1.912280701754386,
+ "grad_norm": 0.5870749354362488,
+ "learning_rate": 4.513014545459038e-06,
+ "loss": 0.1061,
+ "step": 218
+ },
+ {
+ "epoch": 1.9210526315789473,
+ "grad_norm": 0.7279748916625977,
+ "learning_rate": 4.505010499634427e-06,
+ "loss": 0.1032,
+ "step": 219
+ },
+ {
+ "epoch": 1.9298245614035088,
+ "grad_norm": 0.6331414580345154,
+ "learning_rate": 4.4969484322159125e-06,
+ "loss": 0.1109,
+ "step": 220
+ },
+ {
+ "epoch": 1.9385964912280702,
+ "grad_norm": 0.9024543166160583,
+ "learning_rate": 4.488828576506014e-06,
+ "loss": 0.1094,
+ "step": 221
+ },
+ {
+ "epoch": 1.9473684210526314,
+ "grad_norm": 3.540376901626587,
+ "learning_rate": 4.480651167479545e-06,
+ "loss": 0.1154,
+ "step": 222
+ },
+ {
+ "epoch": 1.956140350877193,
+ "grad_norm": 0.9506739377975464,
+ "learning_rate": 4.472416441776817e-06,
+ "loss": 0.108,
+ "step": 223
+ },
+ {
+ "epoch": 1.9649122807017543,
+ "grad_norm": 0.6585081815719604,
+ "learning_rate": 4.464124637696786e-06,
+ "loss": 0.1033,
+ "step": 224
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 1.143038034439087,
+ "learning_rate": 4.455775995190161e-06,
+ "loss": 0.1092,
+ "step": 225
+ },
+ {
+ "epoch": 1.9824561403508771,
+ "grad_norm": 1.148261547088623,
+ "learning_rate": 4.4473707558524555e-06,
+ "loss": 0.1076,
+ "step": 226
+ },
+ {
+ "epoch": 1.9912280701754386,
+ "grad_norm": 0.7375811338424683,
+ "learning_rate": 4.438909162917003e-06,
+ "loss": 0.108,
+ "step": 227
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.5254591703414917,
+ "learning_rate": 4.430391461247911e-06,
+ "loss": 0.1079,
+ "step": 228
+ },
+ {
+ "epoch": 2.008771929824561,
+ "grad_norm": 1.0198495388031006,
+ "learning_rate": 4.42181789733298e-06,
+ "loss": 0.1083,
+ "step": 229
+ },
+ {
+ "epoch": 2.017543859649123,
+ "grad_norm": 0.9234157800674438,
+ "learning_rate": 4.413188719276569e-06,
+ "loss": 0.1084,
+ "step": 230
+ },
+ {
+ "epoch": 2.026315789473684,
+ "grad_norm": 0.5215068459510803,
+ "learning_rate": 4.404504176792414e-06,
+ "loss": 0.1067,
+ "step": 231
+ },
+ {
+ "epoch": 2.0350877192982457,
+ "grad_norm": 0.9296736121177673,
+ "learning_rate": 4.3957645211964065e-06,
+ "loss": 0.1066,
+ "step": 232
+ },
+ {
+ "epoch": 2.043859649122807,
+ "grad_norm": 0.8660671710968018,
+ "learning_rate": 4.386970005399314e-06,
+ "loss": 0.108,
+ "step": 233
+ },
+ {
+ "epoch": 2.0526315789473686,
+ "grad_norm": 0.6014883518218994,
+ "learning_rate": 4.378120883899467e-06,
+ "loss": 0.1068,
+ "step": 234
+ },
+ {
+ "epoch": 2.06140350877193,
+ "grad_norm": 0.6370371580123901,
+ "learning_rate": 4.369217412775393e-06,
+ "loss": 0.1076,
+ "step": 235
+ },
+ {
+ "epoch": 2.0701754385964914,
+ "grad_norm": 0.9806828498840332,
+ "learning_rate": 4.360259849678402e-06,
+ "loss": 0.1071,
+ "step": 236
+ },
+ {
+ "epoch": 2.0789473684210527,
+ "grad_norm": 0.6093440651893616,
+ "learning_rate": 4.351248453825137e-06,
+ "loss": 0.1038,
+ "step": 237
+ },
+ {
+ "epoch": 2.087719298245614,
+ "grad_norm": 1.3494842052459717,
+ "learning_rate": 4.3421834859900695e-06,
+ "loss": 0.1105,
+ "step": 238
+ },
+ {
+ "epoch": 2.0964912280701755,
+ "grad_norm": 0.7621576189994812,
+ "learning_rate": 4.333065208497949e-06,
+ "loss": 0.1048,
+ "step": 239
+ },
+ {
+ "epoch": 2.1052631578947367,
+ "grad_norm": 0.5918282866477966,
+ "learning_rate": 4.3238938852162195e-06,
+ "loss": 0.1086,
+ "step": 240
+ },
+ {
+ "epoch": 2.1140350877192984,
+ "grad_norm": 0.7048676609992981,
+ "learning_rate": 4.314669781547379e-06,
+ "loss": 0.1061,
+ "step": 241
+ },
+ {
+ "epoch": 2.1228070175438596,
+ "grad_norm": 1.0750821828842163,
+ "learning_rate": 4.305393164421301e-06,
+ "loss": 0.1082,
+ "step": 242
+ },
+ {
+ "epoch": 2.1315789473684212,
+ "grad_norm": 0.6171414852142334,
+ "learning_rate": 4.296064302287507e-06,
+ "loss": 0.1039,
+ "step": 243
+ },
+ {
+ "epoch": 2.1403508771929824,
+ "grad_norm": 0.8080905079841614,
+ "learning_rate": 4.286683465107403e-06,
+ "loss": 0.1069,
+ "step": 244
+ },
+ {
+ "epoch": 2.1491228070175437,
+ "grad_norm": 0.5281466245651245,
+ "learning_rate": 4.277250924346461e-06,
+ "loss": 0.1069,
+ "step": 245
+ },
+ {
+ "epoch": 2.1578947368421053,
+ "grad_norm": 0.8070254325866699,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.1061,
+ "step": 246
+ },
+ {
+ "epoch": 2.1666666666666665,
+ "grad_norm": 0.8560577630996704,
+ "learning_rate": 4.25823182541713e-06,
+ "loss": 0.1116,
+ "step": 247
+ },
+ {
+ "epoch": 2.175438596491228,
+ "grad_norm": 0.7772330045700073,
+ "learning_rate": 4.2486458176291176e-06,
+ "loss": 0.1092,
+ "step": 248
+ },
+ {
+ "epoch": 2.1842105263157894,
+ "grad_norm": 0.814601719379425,
+ "learning_rate": 4.239009207005096e-06,
+ "loss": 0.1093,
+ "step": 249
+ },
+ {
+ "epoch": 2.192982456140351,
+ "grad_norm": 0.957789957523346,
+ "learning_rate": 4.2293222724121855e-06,
+ "loss": 0.1075,
+ "step": 250
+ },
+ {
+ "epoch": 2.2017543859649122,
+ "grad_norm": 0.500062108039856,
+ "learning_rate": 4.219585294173799e-06,
+ "loss": 0.1048,
+ "step": 251
+ },
+ {
+ "epoch": 2.2105263157894735,
+ "grad_norm": 0.3866419792175293,
+ "learning_rate": 4.209798554061527e-06,
+ "loss": 0.1074,
+ "step": 252
+ },
+ {
+ "epoch": 2.219298245614035,
+ "grad_norm": 1.1853291988372803,
+ "learning_rate": 4.199962335286985e-06,
+ "loss": 0.1076,
+ "step": 253
+ },
+ {
+ "epoch": 2.2280701754385963,
+ "grad_norm": 0.36602887511253357,
+ "learning_rate": 4.1900769224936125e-06,
+ "loss": 0.108,
+ "step": 254
+ },
+ {
+ "epoch": 2.236842105263158,
+ "grad_norm": 0.2530711889266968,
+ "learning_rate": 4.180142601748447e-06,
+ "loss": 0.1041,
+ "step": 255
+ },
+ {
+ "epoch": 2.245614035087719,
+ "grad_norm": 1.3067054748535156,
+ "learning_rate": 4.170159660533834e-06,
+ "loss": 0.1087,
+ "step": 256
+ },
+ {
+ "epoch": 2.254385964912281,
+ "grad_norm": 0.3442043960094452,
+ "learning_rate": 4.160128387739114e-06,
+ "loss": 0.1099,
+ "step": 257
+ },
+ {
+ "epoch": 2.263157894736842,
+ "grad_norm": 1.174796462059021,
+ "learning_rate": 4.150049073652262e-06,
+ "loss": 0.1063,
+ "step": 258
+ },
+ {
+ "epoch": 2.2719298245614037,
+ "grad_norm": 0.5719411969184875,
+ "learning_rate": 4.1399220099514845e-06,
+ "loss": 0.1043,
+ "step": 259
+ },
+ {
+ "epoch": 2.280701754385965,
+ "grad_norm": 0.7268956303596497,
+ "learning_rate": 4.129747489696781e-06,
+ "loss": 0.1038,
+ "step": 260
+ },
+ {
+ "epoch": 2.2894736842105265,
+ "grad_norm": 0.7028316259384155,
+ "learning_rate": 4.119525807321467e-06,
+ "loss": 0.1052,
+ "step": 261
+ },
+ {
+ "epoch": 2.2982456140350878,
+ "grad_norm": 1.015335202217102,
+ "learning_rate": 4.109257258623644e-06,
+ "loss": 0.1116,
+ "step": 262
+ },
+ {
+ "epoch": 2.307017543859649,
+ "grad_norm": 0.7141755819320679,
+ "learning_rate": 4.098942140757646e-06,
+ "loss": 0.108,
+ "step": 263
+ },
+ {
+ "epoch": 2.3157894736842106,
+ "grad_norm": 0.7656403183937073,
+ "learning_rate": 4.0885807522254435e-06,
+ "loss": 0.1043,
+ "step": 264
+ },
+ {
+ "epoch": 2.324561403508772,
+ "grad_norm": 0.43293774127960205,
+ "learning_rate": 4.078173392867998e-06,
+ "loss": 0.1048,
+ "step": 265
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 0.6755763292312622,
+ "learning_rate": 4.0677203638565895e-06,
+ "loss": 0.1064,
+ "step": 266
+ },
+ {
+ "epoch": 2.3421052631578947,
+ "grad_norm": 0.9648827314376831,
+ "learning_rate": 4.0572219676841e-06,
+ "loss": 0.1088,
+ "step": 267
+ },
+ {
+ "epoch": 2.3508771929824563,
+ "grad_norm": 0.32724836468696594,
+ "learning_rate": 4.046678508156259e-06,
+ "loss": 0.1077,
+ "step": 268
+ },
+ {
+ "epoch": 2.3596491228070176,
+ "grad_norm": 0.4696657061576843,
+ "learning_rate": 4.036090290382855e-06,
+ "loss": 0.1067,
+ "step": 269
+ },
+ {
+ "epoch": 2.3684210526315788,
+ "grad_norm": 0.33901306986808777,
+ "learning_rate": 4.025457620768901e-06,
+ "loss": 0.105,
+ "step": 270
+ },
+ {
+ "epoch": 2.3771929824561404,
+ "grad_norm": 0.5703794360160828,
+ "learning_rate": 4.014780807005775e-06,
+ "loss": 0.1033,
+ "step": 271
+ },
+ {
+ "epoch": 2.3859649122807016,
+ "grad_norm": 0.9639355540275574,
+ "learning_rate": 4.004060158062306e-06,
+ "loss": 0.1041,
+ "step": 272
+ },
+ {
+ "epoch": 2.3947368421052633,
+ "grad_norm": 0.8851558566093445,
+ "learning_rate": 3.993295984175845e-06,
+ "loss": 0.1064,
+ "step": 273
+ },
+ {
+ "epoch": 2.4035087719298245,
+ "grad_norm": 0.5200062990188599,
+ "learning_rate": 3.982488596843276e-06,
+ "loss": 0.1056,
+ "step": 274
+ },
+ {
+ "epoch": 2.412280701754386,
+ "grad_norm": 1.160823106765747,
+ "learning_rate": 3.971638308812007e-06,
+ "loss": 0.1069,
+ "step": 275
+ },
+ {
+ "epoch": 2.4210526315789473,
+ "grad_norm": 1.0191210508346558,
+ "learning_rate": 3.9607454340709215e-06,
+ "loss": 0.1042,
+ "step": 276
+ },
+ {
+ "epoch": 2.4298245614035086,
+ "grad_norm": 0.37181487679481506,
+ "learning_rate": 3.949810287841289e-06,
+ "loss": 0.1062,
+ "step": 277
+ },
+ {
+ "epoch": 2.43859649122807,
+ "grad_norm": 0.9328593611717224,
+ "learning_rate": 3.9388331865676436e-06,
+ "loss": 0.1086,
+ "step": 278
+ },
+ {
+ "epoch": 2.4473684210526314,
+ "grad_norm": 0.8024734258651733,
+ "learning_rate": 3.927814447908625e-06,
+ "loss": 0.1051,
+ "step": 279
+ },
+ {
+ "epoch": 2.456140350877193,
+ "grad_norm": 0.9746696352958679,
+ "learning_rate": 3.916754390727795e-06,
+ "loss": 0.1041,
+ "step": 280
+ },
+ {
+ "epoch": 2.4649122807017543,
+ "grad_norm": 0.5457844138145447,
+ "learning_rate": 3.905653335084394e-06,
+ "loss": 0.1052,
+ "step": 281
+ },
+ {
+ "epoch": 2.473684210526316,
+ "grad_norm": 1.0736924409866333,
+ "learning_rate": 3.8945116022240945e-06,
+ "loss": 0.1075,
+ "step": 282
+ },
+ {
+ "epoch": 2.482456140350877,
+ "grad_norm": 0.6335628032684326,
+ "learning_rate": 3.8833295145696964e-06,
+ "loss": 0.1036,
+ "step": 283
+ },
+ {
+ "epoch": 2.4912280701754383,
+ "grad_norm": 0.6909618377685547,
+ "learning_rate": 3.872107395711799e-06,
+ "loss": 0.1089,
+ "step": 284
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 2.1871702671051025,
+ "learning_rate": 3.860845570399435e-06,
+ "loss": 0.1066,
+ "step": 285
+ },
+ {
+ "epoch": 2.5087719298245617,
+ "grad_norm": 0.5831722617149353,
+ "learning_rate": 3.849544364530678e-06,
+ "loss": 0.1055,
+ "step": 286
+ },
+ {
+ "epoch": 2.517543859649123,
+ "grad_norm": 0.5302637815475464,
+ "learning_rate": 3.838204105143204e-06,
+ "loss": 0.1057,
+ "step": 287
+ },
+ {
+ "epoch": 2.526315789473684,
+ "grad_norm": 0.6348035931587219,
+ "learning_rate": 3.8268251204048335e-06,
+ "loss": 0.1089,
+ "step": 288
+ },
+ {
+ "epoch": 2.5350877192982457,
+ "grad_norm": 2.1932008266448975,
+ "learning_rate": 3.815407739604033e-06,
+ "loss": 0.1043,
+ "step": 289
+ },
+ {
+ "epoch": 2.543859649122807,
+ "grad_norm": 0.4388940930366516,
+ "learning_rate": 3.803952293140385e-06,
+ "loss": 0.1055,
+ "step": 290
+ },
+ {
+ "epoch": 2.5526315789473686,
+ "grad_norm": 0.6853339076042175,
+ "learning_rate": 3.7924591125150265e-06,
+ "loss": 0.1036,
+ "step": 291
+ },
+ {
+ "epoch": 2.56140350877193,
+ "grad_norm": 0.34744876623153687,
+ "learning_rate": 3.78092853032106e-06,
+ "loss": 0.1025,
+ "step": 292
+ },
+ {
+ "epoch": 2.5701754385964914,
+ "grad_norm": 0.9523847699165344,
+ "learning_rate": 3.769360880233922e-06,
+ "loss": 0.1067,
+ "step": 293
+ },
+ {
+ "epoch": 2.5789473684210527,
+ "grad_norm": 1.303745985031128,
+ "learning_rate": 3.7577564970017338e-06,
+ "loss": 0.1082,
+ "step": 294
+ },
+ {
+ "epoch": 2.587719298245614,
+ "grad_norm": 0.9468981623649597,
+ "learning_rate": 3.7461157164356103e-06,
+ "loss": 0.1055,
+ "step": 295
+ },
+ {
+ "epoch": 2.5964912280701755,
+ "grad_norm": 0.7204175591468811,
+ "learning_rate": 3.7344388753999434e-06,
+ "loss": 0.1055,
+ "step": 296
+ },
+ {
+ "epoch": 2.6052631578947367,
+ "grad_norm": 0.5110165476799011,
+ "learning_rate": 3.7227263118026537e-06,
+ "loss": 0.1092,
+ "step": 297
+ },
+ {
+ "epoch": 2.6140350877192984,
+ "grad_norm": 0.6483246088027954,
+ "learning_rate": 3.7109783645854116e-06,
+ "loss": 0.1078,
+ "step": 298
+ },
+ {
+ "epoch": 2.6228070175438596,
+ "grad_norm": 0.5058422684669495,
+ "learning_rate": 3.699195373713831e-06,
+ "loss": 0.1073,
+ "step": 299
+ },
+ {
+ "epoch": 2.6315789473684212,
+ "grad_norm": 0.4123518764972687,
+ "learning_rate": 3.6873776801676265e-06,
+ "loss": 0.1053,
+ "step": 300
+ },
+ {
+ "epoch": 2.6403508771929824,
+ "grad_norm": 1.0864709615707397,
+ "learning_rate": 3.675525625930751e-06,
+ "loss": 0.1048,
+ "step": 301
+ },
+ {
+ "epoch": 2.6491228070175437,
+ "grad_norm": 1.0264904499053955,
+ "learning_rate": 3.6636395539814975e-06,
+ "loss": 0.1059,
+ "step": 302
+ },
+ {
+ "epoch": 2.6578947368421053,
+ "grad_norm": 0.7724822163581848,
+ "learning_rate": 3.651719808282573e-06,
+ "loss": 0.1063,
+ "step": 303
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 0.7474755644798279,
+ "learning_rate": 3.6397667337711475e-06,
+ "loss": 0.1034,
+ "step": 304
+ },
+ {
+ "epoch": 2.675438596491228,
+ "grad_norm": 0.5628909468650818,
+ "learning_rate": 3.6277806763488666e-06,
+ "loss": 0.1026,
+ "step": 305
+ },
+ {
+ "epoch": 2.6842105263157894,
+ "grad_norm": 0.9070547819137573,
+ "learning_rate": 3.6157619828718477e-06,
+ "loss": 0.1031,
+ "step": 306
+ },
+ {
+ "epoch": 2.692982456140351,
+ "grad_norm": 0.6968091130256653,
+ "learning_rate": 3.603711001140641e-06,
+ "loss": 0.1068,
+ "step": 307
+ },
+ {
+ "epoch": 2.7017543859649122,
+ "grad_norm": 0.3764977753162384,
+ "learning_rate": 3.5916280798901604e-06,
+ "loss": 0.1038,
+ "step": 308
+ },
+ {
+ "epoch": 2.7105263157894735,
+ "grad_norm": 5.012625694274902,
+ "learning_rate": 3.5795135687795984e-06,
+ "loss": 0.1129,
+ "step": 309
+ },
+ {
+ "epoch": 2.719298245614035,
+ "grad_norm": 0.6745572686195374,
+ "learning_rate": 3.567367818382303e-06,
+ "loss": 0.1071,
+ "step": 310
+ },
+ {
+ "epoch": 2.7280701754385968,
+ "grad_norm": 1.0659606456756592,
+ "learning_rate": 3.555191180175634e-06,
+ "loss": 0.1067,
+ "step": 311
+ },
+ {
+ "epoch": 2.736842105263158,
+ "grad_norm": 1.7312604188919067,
+ "learning_rate": 3.5429840065307924e-06,
+ "loss": 0.1101,
+ "step": 312
+ },
+ {
+ "epoch": 2.745614035087719,
+ "grad_norm": 1.100364327430725,
+ "learning_rate": 3.5307466507026223e-06,
+ "loss": 0.1098,
+ "step": 313
+ },
+ {
+ "epoch": 2.754385964912281,
+ "grad_norm": 1.0390428304672241,
+ "learning_rate": 3.5184794668193893e-06,
+ "loss": 0.1094,
+ "step": 314
+ },
+ {
+ "epoch": 2.763157894736842,
+ "grad_norm": 0.3369971811771393,
+ "learning_rate": 3.5061828098725327e-06,
+ "loss": 0.1053,
+ "step": 315
+ },
+ {
+ "epoch": 2.7719298245614032,
+ "grad_norm": 0.6130257248878479,
+ "learning_rate": 3.4938570357063906e-06,
+ "loss": 0.106,
+ "step": 316
+ },
+ {
+ "epoch": 2.780701754385965,
+ "grad_norm": 0.6387595534324646,
+ "learning_rate": 3.481502501007904e-06,
+ "loss": 0.1044,
+ "step": 317
+ },
+ {
+ "epoch": 2.7894736842105265,
+ "grad_norm": 1.0731587409973145,
+ "learning_rate": 3.469119563296296e-06,
+ "loss": 0.1097,
+ "step": 318
+ },
+ {
+ "epoch": 2.7982456140350878,
+ "grad_norm": 0.8096229434013367,
+ "learning_rate": 3.4567085809127247e-06,
+ "loss": 0.1076,
+ "step": 319
+ },
+ {
+ "epoch": 2.807017543859649,
+ "grad_norm": 0.5034844279289246,
+ "learning_rate": 3.444269913009912e-06,
+ "loss": 0.1071,
+ "step": 320
+ },
+ {
+ "epoch": 2.8157894736842106,
+ "grad_norm": 0.675139307975769,
+ "learning_rate": 3.4318039195417536e-06,
+ "loss": 0.1039,
+ "step": 321
+ },
+ {
+ "epoch": 2.824561403508772,
+ "grad_norm": 0.7330355644226074,
+ "learning_rate": 3.4193109612528972e-06,
+ "loss": 0.1044,
+ "step": 322
+ },
+ {
+ "epoch": 2.8333333333333335,
+ "grad_norm": 0.6558271646499634,
+ "learning_rate": 3.4067913996683115e-06,
+ "loss": 0.1051,
+ "step": 323
+ },
+ {
+ "epoch": 2.8421052631578947,
+ "grad_norm": 0.8411844372749329,
+ "learning_rate": 3.3942455970828146e-06,
+ "loss": 0.1063,
+ "step": 324
+ },
+ {
+ "epoch": 2.8508771929824563,
+ "grad_norm": 0.4817325174808502,
+ "learning_rate": 3.3816739165505964e-06,
+ "loss": 0.105,
+ "step": 325
+ },
+ {
+ "epoch": 2.8596491228070176,
+ "grad_norm": 0.424554705619812,
+ "learning_rate": 3.3690767218747104e-06,
+ "loss": 0.1037,
+ "step": 326
+ },
+ {
+ "epoch": 2.8684210526315788,
+ "grad_norm": 1.0054417848587036,
+ "learning_rate": 3.3564543775965475e-06,
+ "loss": 0.1058,
+ "step": 327
+ },
+ {
+ "epoch": 2.8771929824561404,
+ "grad_norm": 0.8984584808349609,
+ "learning_rate": 3.3438072489852837e-06,
+ "loss": 0.1079,
+ "step": 328
+ },
+ {
+ "epoch": 2.8859649122807016,
+ "grad_norm": 0.6779558062553406,
+ "learning_rate": 3.331135702027311e-06,
+ "loss": 0.1046,
+ "step": 329
+ },
+ {
+ "epoch": 2.8947368421052633,
+ "grad_norm": 0.6931657195091248,
+ "learning_rate": 3.318440103415649e-06,
+ "loss": 0.1106,
+ "step": 330
+ },
+ {
+ "epoch": 2.9035087719298245,
+ "grad_norm": 0.705264151096344,
+ "learning_rate": 3.305720820539329e-06,
+ "loss": 0.104,
+ "step": 331
+ },
+ {
+ "epoch": 2.912280701754386,
+ "grad_norm": 0.7799407839775085,
+ "learning_rate": 3.2929782214727657e-06,
+ "loss": 0.1019,
+ "step": 332
+ },
+ {
+ "epoch": 2.9210526315789473,
+ "grad_norm": 0.7583760619163513,
+ "learning_rate": 3.2802126749651042e-06,
+ "loss": 0.1049,
+ "step": 333
+ },
+ {
+ "epoch": 2.9298245614035086,
+ "grad_norm": 0.6145837306976318,
+ "learning_rate": 3.2674245504295505e-06,
+ "loss": 0.104,
+ "step": 334
+ },
+ {
+ "epoch": 2.93859649122807,
+ "grad_norm": 0.5170779228210449,
+ "learning_rate": 3.254614217932679e-06,
+ "loss": 0.1024,
+ "step": 335
+ },
+ {
+ "epoch": 2.9473684210526314,
+ "grad_norm": 0.6850940585136414,
+ "learning_rate": 3.241782048183726e-06,
+ "loss": 0.1047,
+ "step": 336
+ },
+ {
+ "epoch": 2.956140350877193,
+ "grad_norm": 0.7307694554328918,
+ "learning_rate": 3.2289284125238597e-06,
+ "loss": 0.1032,
+ "step": 337
+ },
+ {
+ "epoch": 2.9649122807017543,
+ "grad_norm": 0.3386179208755493,
+ "learning_rate": 3.216053682915436e-06,
+ "loss": 0.1037,
+ "step": 338
+ },
+ {
+ "epoch": 2.973684210526316,
+ "grad_norm": 0.7565059065818787,
+ "learning_rate": 3.203158231931234e-06,
+ "loss": 0.1048,
+ "step": 339
+ },
+ {
+ "epoch": 2.982456140350877,
+ "grad_norm": 0.7902039289474487,
+ "learning_rate": 3.190242432743673e-06,
+ "loss": 0.1068,
+ "step": 340
+ },
+ {
+ "epoch": 2.9912280701754383,
+ "grad_norm": 0.42595192790031433,
+ "learning_rate": 3.177306659114015e-06,
+ "loss": 0.1039,
+ "step": 341
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.1214542388916016,
+ "learning_rate": 3.164351285381549e-06,
+ "loss": 0.1062,
+ "step": 342
+ },
+ {
+ "epoch": 3.008771929824561,
+ "grad_norm": 0.7622955441474915,
+ "learning_rate": 3.1513766864527577e-06,
+ "loss": 0.1015,
+ "step": 343
+ },
+ {
+ "epoch": 3.017543859649123,
+ "grad_norm": 0.2676297724246979,
+ "learning_rate": 3.1383832377904676e-06,
+ "loss": 0.1037,
+ "step": 344
+ },
+ {
+ "epoch": 3.026315789473684,
+ "grad_norm": 0.8695605397224426,
+ "learning_rate": 3.1253713154029857e-06,
+ "loss": 0.1056,
+ "step": 345
+ },
+ {
+ "epoch": 3.0350877192982457,
+ "grad_norm": 0.5875906944274902,
+ "learning_rate": 3.1123412958332155e-06,
+ "loss": 0.1067,
+ "step": 346
+ },
+ {
+ "epoch": 3.043859649122807,
+ "grad_norm": 0.7699372172355652,
+ "learning_rate": 3.0992935561477632e-06,
+ "loss": 0.1035,
+ "step": 347
+ },
+ {
+ "epoch": 3.0526315789473686,
+ "grad_norm": 0.5919204354286194,
+ "learning_rate": 3.0862284739260247e-06,
+ "loss": 0.1023,
+ "step": 348
+ },
+ {
+ "epoch": 3.06140350877193,
+ "grad_norm": 1.3211849927902222,
+ "learning_rate": 3.07314642724926e-06,
+ "loss": 0.1065,
+ "step": 349
+ },
+ {
+ "epoch": 3.0701754385964914,
+ "grad_norm": 0.6359637379646301,
+ "learning_rate": 3.0600477946896494e-06,
+ "loss": 0.106,
+ "step": 350
+ },
+ {
+ "epoch": 3.0789473684210527,
+ "grad_norm": 0.35776662826538086,
+ "learning_rate": 3.046932955299344e-06,
+ "loss": 0.1046,
+ "step": 351
+ },
+ {
+ "epoch": 3.087719298245614,
+ "grad_norm": 0.6657406687736511,
+ "learning_rate": 3.0338022885994904e-06,
+ "loss": 0.1076,
+ "step": 352
+ },
+ {
+ "epoch": 3.0964912280701755,
+ "grad_norm": 0.7587785720825195,
+ "learning_rate": 3.0206561745692512e-06,
+ "loss": 0.1043,
+ "step": 353
+ },
+ {
+ "epoch": 3.1052631578947367,
+ "grad_norm": 1.1258317232131958,
+ "learning_rate": 3.0074949936348084e-06,
+ "loss": 0.1043,
+ "step": 354
+ },
+ {
+ "epoch": 3.1140350877192984,
+ "grad_norm": 0.3570568263530731,
+ "learning_rate": 2.9943191266583564e-06,
+ "loss": 0.1032,
+ "step": 355
+ },
+ {
+ "epoch": 3.1228070175438596,
+ "grad_norm": 0.843485414981842,
+ "learning_rate": 2.981128954927075e-06,
+ "loss": 0.1045,
+ "step": 356
+ },
+ {
+ "epoch": 3.1315789473684212,
+ "grad_norm": 0.5719651579856873,
+ "learning_rate": 2.967924860142103e-06,
+ "loss": 0.1052,
+ "step": 357
+ },
+ {
+ "epoch": 3.1403508771929824,
+ "grad_norm": 2.20767879486084,
+ "learning_rate": 2.9547072244074853e-06,
+ "loss": 0.1078,
+ "step": 358
+ },
+ {
+ "epoch": 3.1491228070175437,
+ "grad_norm": 0.3715457022190094,
+ "learning_rate": 2.941476430219122e-06,
+ "loss": 0.1047,
+ "step": 359
+ },
+ {
+ "epoch": 3.1578947368421053,
+ "grad_norm": 0.7803200483322144,
+ "learning_rate": 2.928232860453694e-06,
+ "loss": 0.1029,
+ "step": 360
+ },
+ {
+ "epoch": 3.1666666666666665,
+ "grad_norm": 0.5198164582252502,
+ "learning_rate": 2.9149768983575884e-06,
+ "loss": 0.1032,
+ "step": 361
+ },
+ {
+ "epoch": 3.175438596491228,
+ "grad_norm": 0.7827185988426208,
+ "learning_rate": 2.9017089275358017e-06,
+ "loss": 0.1043,
+ "step": 362
+ },
+ {
+ "epoch": 3.1842105263157894,
+ "grad_norm": 0.4000351130962372,
+ "learning_rate": 2.8884293319408464e-06,
+ "loss": 0.1071,
+ "step": 363
+ },
+ {
+ "epoch": 3.192982456140351,
+ "grad_norm": 0.9913386106491089,
+ "learning_rate": 2.8751384958616318e-06,
+ "loss": 0.1022,
+ "step": 364
+ },
+ {
+ "epoch": 3.2017543859649122,
+ "grad_norm": 0.6975695490837097,
+ "learning_rate": 2.861836803912353e-06,
+ "loss": 0.1029,
+ "step": 365
+ },
+ {
+ "epoch": 3.2105263157894735,
+ "grad_norm": 0.2372695654630661,
+ "learning_rate": 2.8485246410213497e-06,
+ "loss": 0.1015,
+ "step": 366
+ },
+ {
+ "epoch": 3.219298245614035,
+ "grad_norm": 0.447732537984848,
+ "learning_rate": 2.835202392419977e-06,
+ "loss": 0.1052,
+ "step": 367
+ },
+ {
+ "epoch": 3.2280701754385963,
+ "grad_norm": 0.6617346405982971,
+ "learning_rate": 2.8218704436314525e-06,
+ "loss": 0.1055,
+ "step": 368
+ },
+ {
+ "epoch": 3.236842105263158,
+ "grad_norm": 0.5550402402877808,
+ "learning_rate": 2.8085291804596995e-06,
+ "loss": 0.102,
+ "step": 369
+ },
+ {
+ "epoch": 3.245614035087719,
+ "grad_norm": 0.6046020984649658,
+ "learning_rate": 2.795178988978185e-06,
+ "loss": 0.1036,
+ "step": 370
+ },
+ {
+ "epoch": 3.254385964912281,
+ "grad_norm": 0.41890618205070496,
+ "learning_rate": 2.781820255518745e-06,
+ "loss": 0.1036,
+ "step": 371
+ },
+ {
+ "epoch": 3.263157894736842,
+ "grad_norm": 0.8387415409088135,
+ "learning_rate": 2.768453366660408e-06,
+ "loss": 0.1076,
+ "step": 372
+ },
+ {
+ "epoch": 3.2719298245614037,
+ "grad_norm": 0.5318773984909058,
+ "learning_rate": 2.755078709218203e-06,
+ "loss": 0.1052,
+ "step": 373
+ },
+ {
+ "epoch": 3.280701754385965,
+ "grad_norm": 0.6617523431777954,
+ "learning_rate": 2.741696670231969e-06,
+ "loss": 0.1049,
+ "step": 374
+ },
+ {
+ "epoch": 3.2894736842105265,
+ "grad_norm": 1.0190025568008423,
+ "learning_rate": 2.728307636955156e-06,
+ "loss": 0.1034,
+ "step": 375
+ },
+ {
+ "epoch": 3.2982456140350878,
+ "grad_norm": 0.6924716234207153,
+ "learning_rate": 2.714911996843617e-06,
+ "loss": 0.1065,
+ "step": 376
+ },
+ {
+ "epoch": 3.307017543859649,
+ "grad_norm": 0.42501118779182434,
+ "learning_rate": 2.701510137544393e-06,
+ "loss": 0.1019,
+ "step": 377
+ },
+ {
+ "epoch": 3.3157894736842106,
+ "grad_norm": 0.844886064529419,
+ "learning_rate": 2.6881024468845e-06,
+ "loss": 0.1047,
+ "step": 378
+ },
+ {
+ "epoch": 3.324561403508772,
+ "grad_norm": 0.46512728929519653,
+ "learning_rate": 2.674689312859704e-06,
+ "loss": 0.1043,
+ "step": 379
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 0.6242017149925232,
+ "learning_rate": 2.6612711236232915e-06,
+ "loss": 0.1046,
+ "step": 380
+ },
+ {
+ "epoch": 3.3421052631578947,
+ "grad_norm": 0.6578526496887207,
+ "learning_rate": 2.6478482674748375e-06,
+ "loss": 0.1031,
+ "step": 381
+ },
+ {
+ "epoch": 3.3508771929824563,
+ "grad_norm": 0.4822542667388916,
+ "learning_rate": 2.63442113284897e-06,
+ "loss": 0.1053,
+ "step": 382
+ },
+ {
+ "epoch": 3.3596491228070176,
+ "grad_norm": 0.48255595564842224,
+ "learning_rate": 2.6209901083041307e-06,
+ "loss": 0.1058,
+ "step": 383
+ },
+ {
+ "epoch": 3.3684210526315788,
+ "grad_norm": 0.6624025702476501,
+ "learning_rate": 2.6075555825113265e-06,
+ "loss": 0.1066,
+ "step": 384
+ },
+ {
+ "epoch": 3.3771929824561404,
+ "grad_norm": 0.6962618827819824,
+ "learning_rate": 2.5941179442428864e-06,
+ "loss": 0.102,
+ "step": 385
+ },
+ {
+ "epoch": 3.3859649122807016,
+ "grad_norm": 0.4976450502872467,
+ "learning_rate": 2.580677582361208e-06,
+ "loss": 0.1011,
+ "step": 386
+ },
+ {
+ "epoch": 3.3947368421052633,
+ "grad_norm": 0.5283737182617188,
+ "learning_rate": 2.5672348858075053e-06,
+ "loss": 0.1057,
+ "step": 387
+ },
+ {
+ "epoch": 3.4035087719298245,
+ "grad_norm": 0.32338738441467285,
+ "learning_rate": 2.553790243590556e-06,
+ "loss": 0.1015,
+ "step": 388
+ },
+ {
+ "epoch": 3.412280701754386,
+ "grad_norm": 0.7909435629844666,
+ "learning_rate": 2.5403440447754385e-06,
+ "loss": 0.1036,
+ "step": 389
+ },
+ {
+ "epoch": 3.4210526315789473,
+ "grad_norm": 0.6297115087509155,
+ "learning_rate": 2.5268966784722792e-06,
+ "loss": 0.1042,
+ "step": 390
+ },
+ {
+ "epoch": 3.4298245614035086,
+ "grad_norm": 0.32988762855529785,
+ "learning_rate": 2.513448533824988e-06,
+ "loss": 0.1059,
+ "step": 391
+ },
+ {
+ "epoch": 3.43859649122807,
+ "grad_norm": 0.9211220145225525,
+ "learning_rate": 2.5e-06,
+ "loss": 0.1015,
+ "step": 392
+ },
+ {
+ "epoch": 3.4473684210526314,
+ "grad_norm": 1.2157588005065918,
+ "learning_rate": 2.486551466175013e-06,
+ "loss": 0.1035,
+ "step": 393
+ },
+ {
+ "epoch": 3.456140350877193,
+ "grad_norm": 0.4786648452281952,
+ "learning_rate": 2.4731033215277216e-06,
+ "loss": 0.1026,
+ "step": 394
+ },
+ {
+ "epoch": 3.4649122807017543,
+ "grad_norm": 0.37398242950439453,
+ "learning_rate": 2.4596559552245623e-06,
+ "loss": 0.1044,
+ "step": 395
+ },
+ {
+ "epoch": 3.473684210526316,
+ "grad_norm": 0.5536217093467712,
+ "learning_rate": 2.446209756409445e-06,
+ "loss": 0.1043,
+ "step": 396
+ },
+ {
+ "epoch": 3.482456140350877,
+ "grad_norm": 0.708406925201416,
+ "learning_rate": 2.432765114192495e-06,
+ "loss": 0.1046,
+ "step": 397
+ },
+ {
+ "epoch": 3.4912280701754383,
+ "grad_norm": 0.7140893340110779,
+ "learning_rate": 2.4193224176387926e-06,
+ "loss": 0.1039,
+ "step": 398
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 0.8078088760375977,
+ "learning_rate": 2.4058820557571144e-06,
+ "loss": 0.1013,
+ "step": 399
+ },
+ {
+ "epoch": 3.5087719298245617,
+ "grad_norm": 0.7129591107368469,
+ "learning_rate": 2.3924444174886735e-06,
+ "loss": 0.1057,
+ "step": 400
+ },
+ {
+ "epoch": 3.517543859649123,
+ "grad_norm": 1.293412446975708,
+ "learning_rate": 2.37900989169587e-06,
+ "loss": 0.1081,
+ "step": 401
+ },
+ {
+ "epoch": 3.526315789473684,
+ "grad_norm": 0.7235314249992371,
+ "learning_rate": 2.3655788671510314e-06,
+ "loss": 0.1054,
+ "step": 402
+ },
+ {
+ "epoch": 3.5350877192982457,
+ "grad_norm": 0.6008841395378113,
+ "learning_rate": 2.3521517325251637e-06,
+ "loss": 0.1033,
+ "step": 403
+ },
+ {
+ "epoch": 3.543859649122807,
+ "grad_norm": 0.6819609999656677,
+ "learning_rate": 2.3387288763767097e-06,
+ "loss": 0.1019,
+ "step": 404
+ },
+ {
+ "epoch": 3.5526315789473686,
+ "grad_norm": 0.5696406960487366,
+ "learning_rate": 2.325310687140296e-06,
+ "loss": 0.1043,
+ "step": 405
+ },
+ {
+ "epoch": 3.56140350877193,
+ "grad_norm": 0.8597077131271362,
+ "learning_rate": 2.3118975531155003e-06,
+ "loss": 0.1037,
+ "step": 406
+ },
+ {
+ "epoch": 3.5701754385964914,
+ "grad_norm": 0.43985217809677124,
+ "learning_rate": 2.2984898624556075e-06,
+ "loss": 0.105,
+ "step": 407
+ },
+ {
+ "epoch": 3.5789473684210527,
+ "grad_norm": 0.5448469519615173,
+ "learning_rate": 2.2850880031563845e-06,
+ "loss": 0.1037,
+ "step": 408
+ },
+ {
+ "epoch": 3.587719298245614,
+ "grad_norm": 0.8221977949142456,
+ "learning_rate": 2.271692363044845e-06,
+ "loss": 0.1015,
+ "step": 409
+ },
+ {
+ "epoch": 3.5964912280701755,
+ "grad_norm": 0.9838594198226929,
+ "learning_rate": 2.2583033297680316e-06,
+ "loss": 0.1085,
+ "step": 410
+ },
+ {
+ "epoch": 3.6052631578947367,
+ "grad_norm": 1.034848928451538,
+ "learning_rate": 2.2449212907817985e-06,
+ "loss": 0.104,
+ "step": 411
+ },
+ {
+ "epoch": 3.6140350877192984,
+ "grad_norm": 1.0788371562957764,
+ "learning_rate": 2.2315466333395927e-06,
+ "loss": 0.1033,
+ "step": 412
+ },
+ {
+ "epoch": 3.6228070175438596,
+ "grad_norm": 0.49096915125846863,
+ "learning_rate": 2.2181797444812557e-06,
+ "loss": 0.1044,
+ "step": 413
+ },
+ {
+ "epoch": 3.6315789473684212,
+ "grad_norm": 1.309685230255127,
+ "learning_rate": 2.204821011021815e-06,
+ "loss": 0.1036,
+ "step": 414
+ },
+ {
+ "epoch": 3.6403508771929824,
+ "grad_norm": 0.5014146566390991,
+ "learning_rate": 2.191470819540301e-06,
+ "loss": 0.104,
+ "step": 415
+ },
+ {
+ "epoch": 3.6491228070175437,
+ "grad_norm": 0.770470380783081,
+ "learning_rate": 2.178129556368548e-06,
+ "loss": 0.1049,
+ "step": 416
+ },
+ {
+ "epoch": 3.6578947368421053,
+ "grad_norm": 0.4639376699924469,
+ "learning_rate": 2.1647976075800235e-06,
+ "loss": 0.1047,
+ "step": 417
+ },
+ {
+ "epoch": 3.6666666666666665,
+ "grad_norm": 1.101885437965393,
+ "learning_rate": 2.151475358978652e-06,
+ "loss": 0.1035,
+ "step": 418
+ },
+ {
+ "epoch": 3.675438596491228,
+ "grad_norm": 0.5644329786300659,
+ "learning_rate": 2.138163196087648e-06,
+ "loss": 0.103,
+ "step": 419
+ },
+ {
+ "epoch": 3.6842105263157894,
+ "grad_norm": 1.1015008687973022,
+ "learning_rate": 2.1248615041383686e-06,
+ "loss": 0.1054,
+ "step": 420
+ },
+ {
+ "epoch": 3.692982456140351,
+ "grad_norm": 0.7311366200447083,
+ "learning_rate": 2.111570668059155e-06,
+ "loss": 0.1043,
+ "step": 421
+ },
+ {
+ "epoch": 3.7017543859649122,
+ "grad_norm": 0.38242173194885254,
+ "learning_rate": 2.098291072464199e-06,
+ "loss": 0.1041,
+ "step": 422
+ },
+ {
+ "epoch": 3.7105263157894735,
+ "grad_norm": 1.231512188911438,
+ "learning_rate": 2.085023101642412e-06,
+ "loss": 0.1021,
+ "step": 423
+ },
+ {
+ "epoch": 3.719298245614035,
+ "grad_norm": 0.41761213541030884,
+ "learning_rate": 2.0717671395463063e-06,
+ "loss": 0.1062,
+ "step": 424
+ },
+ {
+ "epoch": 3.7280701754385968,
+ "grad_norm": 0.4593309462070465,
+ "learning_rate": 2.0585235697808794e-06,
+ "loss": 0.1012,
+ "step": 425
+ },
+ {
+ "epoch": 3.736842105263158,
+ "grad_norm": 0.9147135019302368,
+ "learning_rate": 2.0452927755925155e-06,
+ "loss": 0.1046,
+ "step": 426
+ },
+ {
+ "epoch": 3.745614035087719,
+ "grad_norm": 0.39639535546302795,
+ "learning_rate": 2.0320751398578984e-06,
+ "loss": 0.1018,
+ "step": 427
+ },
+ {
+ "epoch": 3.754385964912281,
+ "grad_norm": 0.688010573387146,
+ "learning_rate": 2.0188710450729255e-06,
+ "loss": 0.104,
+ "step": 428
+ },
+ {
+ "epoch": 3.763157894736842,
+ "grad_norm": 0.5140353441238403,
+ "learning_rate": 2.005680873341644e-06,
+ "loss": 0.1033,
+ "step": 429
+ },
+ {
+ "epoch": 3.7719298245614032,
+ "grad_norm": 0.5970481634140015,
+ "learning_rate": 1.992505006365191e-06,
+ "loss": 0.1044,
+ "step": 430
+ },
+ {
+ "epoch": 3.780701754385965,
+ "grad_norm": 0.551162838935852,
+ "learning_rate": 1.9793438254307496e-06,
+ "loss": 0.1042,
+ "step": 431
+ },
+ {
+ "epoch": 3.7894736842105265,
+ "grad_norm": 0.5344637632369995,
+ "learning_rate": 1.96619771140051e-06,
+ "loss": 0.1042,
+ "step": 432
+ },
+ {
+ "epoch": 3.7982456140350878,
+ "grad_norm": 0.5357667207717896,
+ "learning_rate": 1.9530670447006566e-06,
+ "loss": 0.101,
+ "step": 433
+ },
+ {
+ "epoch": 3.807017543859649,
+ "grad_norm": 1.2536660432815552,
+ "learning_rate": 1.9399522053103514e-06,
+ "loss": 0.1008,
+ "step": 434
+ },
+ {
+ "epoch": 3.8157894736842106,
+ "grad_norm": 0.4888289272785187,
+ "learning_rate": 1.926853572750741e-06,
+ "loss": 0.1028,
+ "step": 435
+ },
+ {
+ "epoch": 3.824561403508772,
+ "grad_norm": 0.5810404419898987,
+ "learning_rate": 1.913771526073976e-06,
+ "loss": 0.1031,
+ "step": 436
+ },
+ {
+ "epoch": 3.8333333333333335,
+ "grad_norm": 0.5372979044914246,
+ "learning_rate": 1.9007064438522374e-06,
+ "loss": 0.107,
+ "step": 437
+ },
+ {
+ "epoch": 3.8421052631578947,
+ "grad_norm": 0.8293616771697998,
+ "learning_rate": 1.8876587041667855e-06,
+ "loss": 0.1033,
+ "step": 438
+ },
+ {
+ "epoch": 3.8508771929824563,
+ "grad_norm": 2.361504554748535,
+ "learning_rate": 1.8746286845970145e-06,
+ "loss": 0.1098,
+ "step": 439
+ },
+ {
+ "epoch": 3.8596491228070176,
+ "grad_norm": 0.70230633020401,
+ "learning_rate": 1.8616167622095328e-06,
+ "loss": 0.1034,
+ "step": 440
+ },
+ {
+ "epoch": 3.8684210526315788,
+ "grad_norm": 0.6323564052581787,
+ "learning_rate": 1.8486233135472436e-06,
+ "loss": 0.1058,
+ "step": 441
+ },
+ {
+ "epoch": 3.8771929824561404,
+ "grad_norm": 0.48205408453941345,
+ "learning_rate": 1.8356487146184517e-06,
+ "loss": 0.105,
+ "step": 442
+ },
+ {
+ "epoch": 3.8859649122807016,
+ "grad_norm": 0.6996872425079346,
+ "learning_rate": 1.8226933408859864e-06,
+ "loss": 0.1083,
+ "step": 443
+ },
+ {
+ "epoch": 3.8947368421052633,
+ "grad_norm": 0.4114651679992676,
+ "learning_rate": 1.8097575672563278e-06,
+ "loss": 0.1003,
+ "step": 444
+ },
+ {
+ "epoch": 3.9035087719298245,
+ "grad_norm": 0.5234648585319519,
+ "learning_rate": 1.7968417680687666e-06,
+ "loss": 0.1019,
+ "step": 445
+ },
+ {
+ "epoch": 3.912280701754386,
+ "grad_norm": 1.0571491718292236,
+ "learning_rate": 1.7839463170845641e-06,
+ "loss": 0.1003,
+ "step": 446
+ },
+ {
+ "epoch": 3.9210526315789473,
+ "grad_norm": 0.7470094561576843,
+ "learning_rate": 1.7710715874761408e-06,
+ "loss": 0.1061,
+ "step": 447
+ },
+ {
+ "epoch": 3.9298245614035086,
+ "grad_norm": 0.901695191860199,
+ "learning_rate": 1.7582179518162742e-06,
+ "loss": 0.1015,
+ "step": 448
+ },
+ {
+ "epoch": 3.93859649122807,
+ "grad_norm": 1.0251179933547974,
+ "learning_rate": 1.7453857820673215e-06,
+ "loss": 0.1,
+ "step": 449
+ },
+ {
+ "epoch": 3.9473684210526314,
+ "grad_norm": 0.5065406560897827,
+ "learning_rate": 1.7325754495704508e-06,
+ "loss": 0.1036,
+ "step": 450
+ },
+ {
+ "epoch": 3.956140350877193,
+ "grad_norm": 0.9541155099868774,
+ "learning_rate": 1.7197873250348962e-06,
+ "loss": 0.1015,
+ "step": 451
+ },
+ {
+ "epoch": 3.9649122807017543,
+ "grad_norm": 0.6264199018478394,
+ "learning_rate": 1.7070217785272354e-06,
+ "loss": 0.1026,
+ "step": 452
+ },
+ {
+ "epoch": 3.973684210526316,
+ "grad_norm": 0.6260526180267334,
+ "learning_rate": 1.6942791794606716e-06,
+ "loss": 0.1039,
+ "step": 453
+ },
+ {
+ "epoch": 3.982456140350877,
+ "grad_norm": 0.4730931222438812,
+ "learning_rate": 1.681559896584352e-06,
+ "loss": 0.1045,
+ "step": 454
+ },
+ {
+ "epoch": 3.9912280701754383,
+ "grad_norm": 0.5011451840400696,
+ "learning_rate": 1.668864297972689e-06,
+ "loss": 0.1062,
+ "step": 455
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 1.0113046169281006,
+ "learning_rate": 1.6561927510147172e-06,
+ "loss": 0.1005,
+ "step": 456
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 684,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 114,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.383996029659408e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-456/training_args.bin b/checkpoint-456/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf
--- /dev/null
+++ b/checkpoint-456/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae
+size 7992
diff --git a/checkpoint-456/zero_to_fp32.py b/checkpoint-456/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-456/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-570/README.md b/checkpoint-570/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615
--- /dev/null
+++ b/checkpoint-570/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-570/adapter_config.json b/checkpoint-570/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/checkpoint-570/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-570/adapter_model.safetensors b/checkpoint-570/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..03e5d913f0cabdef46f28c2746e18c64694aa920
--- /dev/null
+++ b/checkpoint-570/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4bfd3d5f12af03a754a0ee43e020ca5f08d1d2241ff456cfef469e34cf6f2aa
+size 9016826528
diff --git a/checkpoint-570/global_step570/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-570/global_step570/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..10f08c4495d0763d3292461a287ae16760fbfa34
--- /dev/null
+++ b/checkpoint-570/global_step570/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10bc23db9233737e64aa4c5a8bb1fe4760aa94691ce7df815838960f65abe9d2
+size 27050164444
diff --git a/checkpoint-570/global_step570/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-570/global_step570/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f3d4f593c55f517f9992f81fdf4bb920c6280e62
--- /dev/null
+++ b/checkpoint-570/global_step570/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:170708ac3a01d1c0140c7d33c17f2710b074d310e1b3c5468e4c011a7ea23e30
+size 27050169884
diff --git a/checkpoint-570/global_step570/mp_rank_00_model_states.pt b/checkpoint-570/global_step570/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4af4c9a635017c52f7532cb7e03e38f4260c14b5
--- /dev/null
+++ b/checkpoint-570/global_step570/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38a54520a329f55ae70fee2fc9270649647c866ec0d5a2e9bed66c2950369245
+size 9776788601
diff --git a/checkpoint-570/latest b/checkpoint-570/latest
new file mode 100644
index 0000000000000000000000000000000000000000..0433d1c81a4b69bdd8533de1f0573850078819c8
--- /dev/null
+++ b/checkpoint-570/latest
@@ -0,0 +1 @@
+global_step570
\ No newline at end of file
diff --git a/checkpoint-570/rng_state_0.pth b/checkpoint-570/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cadea5c4497e157de18771025fb48dd7a47bdfb2
--- /dev/null
+++ b/checkpoint-570/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ef06f6fc50741e0a072d30f8d6ef66788bbe7cb3d11d5f3592a9eec58dcbdd1
+size 14512
diff --git a/checkpoint-570/rng_state_1.pth b/checkpoint-570/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..521c7cd5a942c2b3d731a0df2302940e8e1baf65
--- /dev/null
+++ b/checkpoint-570/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5b9f4f89dbf7eb3015045d850b8e4485292b7d21154769139ee2c636add2ea3
+size 14512
diff --git a/checkpoint-570/scheduler.pt b/checkpoint-570/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0357e35344f89f2f24bb0d414d635df04fbbd556
--- /dev/null
+++ b/checkpoint-570/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:865e282c4f805f5c50f6c4d4aa455e69a7386950da590f7fd7b70db9aef5414c
+size 1064
diff --git a/checkpoint-570/special_tokens_map.json b/checkpoint-570/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-570/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-570/tokenizer.json b/checkpoint-570/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-570/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-570/tokenizer_config.json b/checkpoint-570/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/checkpoint-570/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-570/trainer_state.json b/checkpoint-570/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2a219421f43a17316af934fac0e081e4a99d61e
--- /dev/null
+++ b/checkpoint-570/trainer_state.json
@@ -0,0 +1,4023 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 570,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.008771929824561403,
+ "grad_norm": 39.56407165527344,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 5.1375,
+ "step": 1
+ },
+ {
+ "epoch": 0.017543859649122806,
+ "grad_norm": 40.30452346801758,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 5.1185,
+ "step": 2
+ },
+ {
+ "epoch": 0.02631578947368421,
+ "grad_norm": 40.062313079833984,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 5.0762,
+ "step": 3
+ },
+ {
+ "epoch": 0.03508771929824561,
+ "grad_norm": 39.17148208618164,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 5.016,
+ "step": 4
+ },
+ {
+ "epoch": 0.043859649122807015,
+ "grad_norm": 40.67367172241211,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 5.0428,
+ "step": 5
+ },
+ {
+ "epoch": 0.05263157894736842,
+ "grad_norm": 38.18095016479492,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 5.2025,
+ "step": 6
+ },
+ {
+ "epoch": 0.06140350877192982,
+ "grad_norm": 39.12940979003906,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 4.9896,
+ "step": 7
+ },
+ {
+ "epoch": 0.07017543859649122,
+ "grad_norm": 38.84568405151367,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 5.1078,
+ "step": 8
+ },
+ {
+ "epoch": 0.07894736842105263,
+ "grad_norm": 39.38333511352539,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 5.0808,
+ "step": 9
+ },
+ {
+ "epoch": 0.08771929824561403,
+ "grad_norm": 39.427650451660156,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 5.0534,
+ "step": 10
+ },
+ {
+ "epoch": 0.09649122807017543,
+ "grad_norm": 39.29513168334961,
+ "learning_rate": 5.5e-07,
+ "loss": 5.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.10526315789473684,
+ "grad_norm": 39.641231536865234,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 5.0317,
+ "step": 12
+ },
+ {
+ "epoch": 0.11403508771929824,
+ "grad_norm": 37.91259765625,
+ "learning_rate": 6.5e-07,
+ "loss": 4.912,
+ "step": 13
+ },
+ {
+ "epoch": 0.12280701754385964,
+ "grad_norm": 38.203548431396484,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 4.9705,
+ "step": 14
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 39.15998840332031,
+ "learning_rate": 7.5e-07,
+ "loss": 4.6962,
+ "step": 15
+ },
+ {
+ "epoch": 0.14035087719298245,
+ "grad_norm": 37.754669189453125,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 4.6262,
+ "step": 16
+ },
+ {
+ "epoch": 0.14912280701754385,
+ "grad_norm": 35.871490478515625,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 4.5422,
+ "step": 17
+ },
+ {
+ "epoch": 0.15789473684210525,
+ "grad_norm": 36.16888427734375,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 4.664,
+ "step": 18
+ },
+ {
+ "epoch": 0.16666666666666666,
+ "grad_norm": 33.520118713378906,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 4.4697,
+ "step": 19
+ },
+ {
+ "epoch": 0.17543859649122806,
+ "grad_norm": 30.896282196044922,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 4.3568,
+ "step": 20
+ },
+ {
+ "epoch": 0.18421052631578946,
+ "grad_norm": 29.944643020629883,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 4.2269,
+ "step": 21
+ },
+ {
+ "epoch": 0.19298245614035087,
+ "grad_norm": 25.224485397338867,
+ "learning_rate": 1.1e-06,
+ "loss": 4.1272,
+ "step": 22
+ },
+ {
+ "epoch": 0.20175438596491227,
+ "grad_norm": 24.410480499267578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 4.0585,
+ "step": 23
+ },
+ {
+ "epoch": 0.21052631578947367,
+ "grad_norm": 21.480648040771484,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 3.9472,
+ "step": 24
+ },
+ {
+ "epoch": 0.21929824561403508,
+ "grad_norm": 20.61946678161621,
+ "learning_rate": 1.25e-06,
+ "loss": 3.8879,
+ "step": 25
+ },
+ {
+ "epoch": 0.22807017543859648,
+ "grad_norm": 19.578271865844727,
+ "learning_rate": 1.3e-06,
+ "loss": 3.6783,
+ "step": 26
+ },
+ {
+ "epoch": 0.23684210526315788,
+ "grad_norm": 17.418983459472656,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 3.6826,
+ "step": 27
+ },
+ {
+ "epoch": 0.24561403508771928,
+ "grad_norm": 18.160301208496094,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 3.478,
+ "step": 28
+ },
+ {
+ "epoch": 0.2543859649122807,
+ "grad_norm": 17.573204040527344,
+ "learning_rate": 1.45e-06,
+ "loss": 3.459,
+ "step": 29
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 17.1265869140625,
+ "learning_rate": 1.5e-06,
+ "loss": 3.3999,
+ "step": 30
+ },
+ {
+ "epoch": 0.2719298245614035,
+ "grad_norm": 15.527145385742188,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 3.2817,
+ "step": 31
+ },
+ {
+ "epoch": 0.2807017543859649,
+ "grad_norm": 14.773847579956055,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 3.234,
+ "step": 32
+ },
+ {
+ "epoch": 0.2894736842105263,
+ "grad_norm": 12.039301872253418,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 3.132,
+ "step": 33
+ },
+ {
+ "epoch": 0.2982456140350877,
+ "grad_norm": 9.217979431152344,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 3.0548,
+ "step": 34
+ },
+ {
+ "epoch": 0.30701754385964913,
+ "grad_norm": 7.575639724731445,
+ "learning_rate": 1.75e-06,
+ "loss": 2.9529,
+ "step": 35
+ },
+ {
+ "epoch": 0.3157894736842105,
+ "grad_norm": 7.496004104614258,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 2.8967,
+ "step": 36
+ },
+ {
+ "epoch": 0.32456140350877194,
+ "grad_norm": 7.45414924621582,
+ "learning_rate": 1.85e-06,
+ "loss": 2.8837,
+ "step": 37
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 8.555658340454102,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 2.7473,
+ "step": 38
+ },
+ {
+ "epoch": 0.34210526315789475,
+ "grad_norm": 10.03805160522461,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 2.7355,
+ "step": 39
+ },
+ {
+ "epoch": 0.3508771929824561,
+ "grad_norm": 9.30649471282959,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 2.6587,
+ "step": 40
+ },
+ {
+ "epoch": 0.35964912280701755,
+ "grad_norm": 8.510339736938477,
+ "learning_rate": 2.05e-06,
+ "loss": 2.5977,
+ "step": 41
+ },
+ {
+ "epoch": 0.3684210526315789,
+ "grad_norm": 4.709080696105957,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 2.6286,
+ "step": 42
+ },
+ {
+ "epoch": 0.37719298245614036,
+ "grad_norm": 5.128961086273193,
+ "learning_rate": 2.15e-06,
+ "loss": 2.4558,
+ "step": 43
+ },
+ {
+ "epoch": 0.38596491228070173,
+ "grad_norm": 5.190136432647705,
+ "learning_rate": 2.2e-06,
+ "loss": 2.4432,
+ "step": 44
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 4.893551349639893,
+ "learning_rate": 2.25e-06,
+ "loss": 2.4939,
+ "step": 45
+ },
+ {
+ "epoch": 0.40350877192982454,
+ "grad_norm": 5.2434983253479,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 2.3381,
+ "step": 46
+ },
+ {
+ "epoch": 0.41228070175438597,
+ "grad_norm": 5.122412204742432,
+ "learning_rate": 2.35e-06,
+ "loss": 2.313,
+ "step": 47
+ },
+ {
+ "epoch": 0.42105263157894735,
+ "grad_norm": 4.577274799346924,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 2.2236,
+ "step": 48
+ },
+ {
+ "epoch": 0.4298245614035088,
+ "grad_norm": 4.722769737243652,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 2.1987,
+ "step": 49
+ },
+ {
+ "epoch": 0.43859649122807015,
+ "grad_norm": 5.059235095977783,
+ "learning_rate": 2.5e-06,
+ "loss": 2.1415,
+ "step": 50
+ },
+ {
+ "epoch": 0.4473684210526316,
+ "grad_norm": 4.454439640045166,
+ "learning_rate": 2.55e-06,
+ "loss": 2.0466,
+ "step": 51
+ },
+ {
+ "epoch": 0.45614035087719296,
+ "grad_norm": 4.94586706161499,
+ "learning_rate": 2.6e-06,
+ "loss": 1.8762,
+ "step": 52
+ },
+ {
+ "epoch": 0.4649122807017544,
+ "grad_norm": 4.704402446746826,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.8012,
+ "step": 53
+ },
+ {
+ "epoch": 0.47368421052631576,
+ "grad_norm": 6.125903129577637,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 1.7669,
+ "step": 54
+ },
+ {
+ "epoch": 0.4824561403508772,
+ "grad_norm": 4.5356059074401855,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.6607,
+ "step": 55
+ },
+ {
+ "epoch": 0.49122807017543857,
+ "grad_norm": 6.56803035736084,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 1.6291,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 4.910050392150879,
+ "learning_rate": 2.85e-06,
+ "loss": 1.5545,
+ "step": 57
+ },
+ {
+ "epoch": 0.5087719298245614,
+ "grad_norm": 8.733433723449707,
+ "learning_rate": 2.9e-06,
+ "loss": 1.4206,
+ "step": 58
+ },
+ {
+ "epoch": 0.5175438596491229,
+ "grad_norm": 8.582486152648926,
+ "learning_rate": 2.95e-06,
+ "loss": 1.3912,
+ "step": 59
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 13.710689544677734,
+ "learning_rate": 3e-06,
+ "loss": 1.3297,
+ "step": 60
+ },
+ {
+ "epoch": 0.5350877192982456,
+ "grad_norm": 23.400312423706055,
+ "learning_rate": 3.05e-06,
+ "loss": 1.296,
+ "step": 61
+ },
+ {
+ "epoch": 0.543859649122807,
+ "grad_norm": 5.678805351257324,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.2259,
+ "step": 62
+ },
+ {
+ "epoch": 0.5526315789473685,
+ "grad_norm": 14.700899124145508,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 1.1087,
+ "step": 63
+ },
+ {
+ "epoch": 0.5614035087719298,
+ "grad_norm": 19.38919448852539,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 1.1805,
+ "step": 64
+ },
+ {
+ "epoch": 0.5701754385964912,
+ "grad_norm": 8.460039138793945,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 1.0963,
+ "step": 65
+ },
+ {
+ "epoch": 0.5789473684210527,
+ "grad_norm": 13.371014595031738,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 1.0627,
+ "step": 66
+ },
+ {
+ "epoch": 0.5877192982456141,
+ "grad_norm": 22.380569458007812,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0869,
+ "step": 67
+ },
+ {
+ "epoch": 0.5964912280701754,
+ "grad_norm": 5.780513286590576,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9991,
+ "step": 68
+ },
+ {
+ "epoch": 0.6052631578947368,
+ "grad_norm": 19.850841522216797,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9683,
+ "step": 69
+ },
+ {
+ "epoch": 0.6140350877192983,
+ "grad_norm": 17.160703659057617,
+ "learning_rate": 3.5e-06,
+ "loss": 0.845,
+ "step": 70
+ },
+ {
+ "epoch": 0.6228070175438597,
+ "grad_norm": 14.264311790466309,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8059,
+ "step": 71
+ },
+ {
+ "epoch": 0.631578947368421,
+ "grad_norm": 26.39459991455078,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.85,
+ "step": 72
+ },
+ {
+ "epoch": 0.6403508771929824,
+ "grad_norm": 51.10348892211914,
+ "learning_rate": 3.65e-06,
+ "loss": 0.9755,
+ "step": 73
+ },
+ {
+ "epoch": 0.6491228070175439,
+ "grad_norm": 28.795856475830078,
+ "learning_rate": 3.7e-06,
+ "loss": 0.8966,
+ "step": 74
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.6617937088012695,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7716,
+ "step": 75
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.729666709899902,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.7578,
+ "step": 76
+ },
+ {
+ "epoch": 0.6754385964912281,
+ "grad_norm": 7.109970569610596,
+ "learning_rate": 3.85e-06,
+ "loss": 0.7055,
+ "step": 77
+ },
+ {
+ "epoch": 0.6842105263157895,
+ "grad_norm": 20.84659194946289,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.7458,
+ "step": 78
+ },
+ {
+ "epoch": 0.6929824561403509,
+ "grad_norm": 21.601303100585938,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6879,
+ "step": 79
+ },
+ {
+ "epoch": 0.7017543859649122,
+ "grad_norm": 3.6914751529693604,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6179,
+ "step": 80
+ },
+ {
+ "epoch": 0.7105263157894737,
+ "grad_norm": 16.539325714111328,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5716,
+ "step": 81
+ },
+ {
+ "epoch": 0.7192982456140351,
+ "grad_norm": 13.931925773620605,
+ "learning_rate": 4.1e-06,
+ "loss": 0.558,
+ "step": 82
+ },
+ {
+ "epoch": 0.7280701754385965,
+ "grad_norm": 10.52951717376709,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6018,
+ "step": 83
+ },
+ {
+ "epoch": 0.7368421052631579,
+ "grad_norm": 17.337060928344727,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5501,
+ "step": 84
+ },
+ {
+ "epoch": 0.7456140350877193,
+ "grad_norm": 13.500468254089355,
+ "learning_rate": 4.25e-06,
+ "loss": 0.5214,
+ "step": 85
+ },
+ {
+ "epoch": 0.7543859649122807,
+ "grad_norm": 10.290645599365234,
+ "learning_rate": 4.3e-06,
+ "loss": 0.4996,
+ "step": 86
+ },
+ {
+ "epoch": 0.7631578947368421,
+ "grad_norm": 9.757556915283203,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.498,
+ "step": 87
+ },
+ {
+ "epoch": 0.7719298245614035,
+ "grad_norm": 9.325140953063965,
+ "learning_rate": 4.4e-06,
+ "loss": 0.4721,
+ "step": 88
+ },
+ {
+ "epoch": 0.7807017543859649,
+ "grad_norm": 2.9322128295898438,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.4528,
+ "step": 89
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 10.484073638916016,
+ "learning_rate": 4.5e-06,
+ "loss": 0.445,
+ "step": 90
+ },
+ {
+ "epoch": 0.7982456140350878,
+ "grad_norm": 32.7827262878418,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5105,
+ "step": 91
+ },
+ {
+ "epoch": 0.8070175438596491,
+ "grad_norm": 2.8477306365966797,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.4117,
+ "step": 92
+ },
+ {
+ "epoch": 0.8157894736842105,
+ "grad_norm": 2.7680225372314453,
+ "learning_rate": 4.65e-06,
+ "loss": 0.3653,
+ "step": 93
+ },
+ {
+ "epoch": 0.8245614035087719,
+ "grad_norm": 2.6512742042541504,
+ "learning_rate": 4.7e-06,
+ "loss": 0.3878,
+ "step": 94
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 6.453914165496826,
+ "learning_rate": 4.75e-06,
+ "loss": 0.3611,
+ "step": 95
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.4594080448150635,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.3817,
+ "step": 96
+ },
+ {
+ "epoch": 0.8508771929824561,
+ "grad_norm": 3.6144917011260986,
+ "learning_rate": 4.85e-06,
+ "loss": 0.3618,
+ "step": 97
+ },
+ {
+ "epoch": 0.8596491228070176,
+ "grad_norm": 5.349407196044922,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.3218,
+ "step": 98
+ },
+ {
+ "epoch": 0.868421052631579,
+ "grad_norm": 13.671236991882324,
+ "learning_rate": 4.95e-06,
+ "loss": 0.3329,
+ "step": 99
+ },
+ {
+ "epoch": 0.8771929824561403,
+ "grad_norm": 5.84046745300293,
+ "learning_rate": 5e-06,
+ "loss": 0.2967,
+ "step": 100
+ },
+ {
+ "epoch": 0.8859649122807017,
+ "grad_norm": 14.005338668823242,
+ "learning_rate": 4.999963827125897e-06,
+ "loss": 0.303,
+ "step": 101
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 9.18114185333252,
+ "learning_rate": 4.999855309550366e-06,
+ "loss": 0.2762,
+ "step": 102
+ },
+ {
+ "epoch": 0.9035087719298246,
+ "grad_norm": 3.0800487995147705,
+ "learning_rate": 4.999674450413725e-06,
+ "loss": 0.2628,
+ "step": 103
+ },
+ {
+ "epoch": 0.9122807017543859,
+ "grad_norm": 82.03578186035156,
+ "learning_rate": 4.999421254949728e-06,
+ "loss": 0.4065,
+ "step": 104
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 77.66315460205078,
+ "learning_rate": 4.99909573048542e-06,
+ "loss": 0.4307,
+ "step": 105
+ },
+ {
+ "epoch": 0.9298245614035088,
+ "grad_norm": 18.28767967224121,
+ "learning_rate": 4.998697886440927e-06,
+ "loss": 0.2571,
+ "step": 106
+ },
+ {
+ "epoch": 0.9385964912280702,
+ "grad_norm": 5.960445880889893,
+ "learning_rate": 4.998227734329177e-06,
+ "loss": 0.2847,
+ "step": 107
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 5.437699794769287,
+ "learning_rate": 4.9976852877555755e-06,
+ "loss": 0.2728,
+ "step": 108
+ },
+ {
+ "epoch": 0.956140350877193,
+ "grad_norm": 3.379631280899048,
+ "learning_rate": 4.997070562417602e-06,
+ "loss": 0.2467,
+ "step": 109
+ },
+ {
+ "epoch": 0.9649122807017544,
+ "grad_norm": 3.1625075340270996,
+ "learning_rate": 4.996383576104362e-06,
+ "loss": 0.2273,
+ "step": 110
+ },
+ {
+ "epoch": 0.9736842105263158,
+ "grad_norm": 15.588600158691406,
+ "learning_rate": 4.995624348696071e-06,
+ "loss": 0.2486,
+ "step": 111
+ },
+ {
+ "epoch": 0.9824561403508771,
+ "grad_norm": 2.631044387817383,
+ "learning_rate": 4.9947929021634815e-06,
+ "loss": 0.1964,
+ "step": 112
+ },
+ {
+ "epoch": 0.9912280701754386,
+ "grad_norm": 4.706504821777344,
+ "learning_rate": 4.993889260567239e-06,
+ "loss": 0.1901,
+ "step": 113
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 10.368465423583984,
+ "learning_rate": 4.9929134500571954e-06,
+ "loss": 0.1996,
+ "step": 114
+ },
+ {
+ "epoch": 1.0087719298245614,
+ "grad_norm": 30.44986343383789,
+ "learning_rate": 4.991865498871647e-06,
+ "loss": 0.2606,
+ "step": 115
+ },
+ {
+ "epoch": 1.0175438596491229,
+ "grad_norm": 14.421515464782715,
+ "learning_rate": 4.99074543733652e-06,
+ "loss": 0.2394,
+ "step": 116
+ },
+ {
+ "epoch": 1.0263157894736843,
+ "grad_norm": 14.072005271911621,
+ "learning_rate": 4.989553297864489e-06,
+ "loss": 0.2288,
+ "step": 117
+ },
+ {
+ "epoch": 1.0350877192982457,
+ "grad_norm": 4.395325660705566,
+ "learning_rate": 4.988289114954045e-06,
+ "loss": 0.2129,
+ "step": 118
+ },
+ {
+ "epoch": 1.043859649122807,
+ "grad_norm": 7.286703586578369,
+ "learning_rate": 4.986952925188489e-06,
+ "loss": 0.186,
+ "step": 119
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 8.332784652709961,
+ "learning_rate": 4.98554476723488e-06,
+ "loss": 0.178,
+ "step": 120
+ },
+ {
+ "epoch": 1.0614035087719298,
+ "grad_norm": 1.3646447658538818,
+ "learning_rate": 4.984064681842917e-06,
+ "loss": 0.1687,
+ "step": 121
+ },
+ {
+ "epoch": 1.0701754385964912,
+ "grad_norm": 4.494940757751465,
+ "learning_rate": 4.982512711843753e-06,
+ "loss": 0.1881,
+ "step": 122
+ },
+ {
+ "epoch": 1.0789473684210527,
+ "grad_norm": 3.3929836750030518,
+ "learning_rate": 4.980888902148757e-06,
+ "loss": 0.1764,
+ "step": 123
+ },
+ {
+ "epoch": 1.087719298245614,
+ "grad_norm": 1.8281155824661255,
+ "learning_rate": 4.979193299748225e-06,
+ "loss": 0.1602,
+ "step": 124
+ },
+ {
+ "epoch": 1.0964912280701755,
+ "grad_norm": 3.494239568710327,
+ "learning_rate": 4.977425953710005e-06,
+ "loss": 0.1729,
+ "step": 125
+ },
+ {
+ "epoch": 1.1052631578947367,
+ "grad_norm": 1.500410556793213,
+ "learning_rate": 4.975586915178084e-06,
+ "loss": 0.1666,
+ "step": 126
+ },
+ {
+ "epoch": 1.1140350877192982,
+ "grad_norm": 1.4680222272872925,
+ "learning_rate": 4.973676237371111e-06,
+ "loss": 0.159,
+ "step": 127
+ },
+ {
+ "epoch": 1.1228070175438596,
+ "grad_norm": 3.0383460521698,
+ "learning_rate": 4.971693975580851e-06,
+ "loss": 0.1484,
+ "step": 128
+ },
+ {
+ "epoch": 1.131578947368421,
+ "grad_norm": 3.74821138381958,
+ "learning_rate": 4.969640187170591e-06,
+ "loss": 0.1586,
+ "step": 129
+ },
+ {
+ "epoch": 1.1403508771929824,
+ "grad_norm": 4.682602405548096,
+ "learning_rate": 4.967514931573473e-06,
+ "loss": 0.1619,
+ "step": 130
+ },
+ {
+ "epoch": 1.1491228070175439,
+ "grad_norm": 3.90673565864563,
+ "learning_rate": 4.965318270290779e-06,
+ "loss": 0.164,
+ "step": 131
+ },
+ {
+ "epoch": 1.1578947368421053,
+ "grad_norm": 2.2017388343811035,
+ "learning_rate": 4.963050266890152e-06,
+ "loss": 0.1499,
+ "step": 132
+ },
+ {
+ "epoch": 1.1666666666666667,
+ "grad_norm": 2.4211816787719727,
+ "learning_rate": 4.960710987003753e-06,
+ "loss": 0.1387,
+ "step": 133
+ },
+ {
+ "epoch": 1.1754385964912282,
+ "grad_norm": 1.7753759622573853,
+ "learning_rate": 4.958300498326363e-06,
+ "loss": 0.1441,
+ "step": 134
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 1.5529910326004028,
+ "learning_rate": 4.955818870613425e-06,
+ "loss": 0.1304,
+ "step": 135
+ },
+ {
+ "epoch": 1.1929824561403508,
+ "grad_norm": 2.090593099594116,
+ "learning_rate": 4.953266175679023e-06,
+ "loss": 0.1419,
+ "step": 136
+ },
+ {
+ "epoch": 1.2017543859649122,
+ "grad_norm": 2.7141878604888916,
+ "learning_rate": 4.95064248739381e-06,
+ "loss": 0.1444,
+ "step": 137
+ },
+ {
+ "epoch": 1.2105263157894737,
+ "grad_norm": 2.3690481185913086,
+ "learning_rate": 4.947947881682861e-06,
+ "loss": 0.1383,
+ "step": 138
+ },
+ {
+ "epoch": 1.219298245614035,
+ "grad_norm": 2.2403147220611572,
+ "learning_rate": 4.945182436523482e-06,
+ "loss": 0.1418,
+ "step": 139
+ },
+ {
+ "epoch": 1.2280701754385965,
+ "grad_norm": 1.3939160108566284,
+ "learning_rate": 4.942346231942955e-06,
+ "loss": 0.1307,
+ "step": 140
+ },
+ {
+ "epoch": 1.236842105263158,
+ "grad_norm": 11.276732444763184,
+ "learning_rate": 4.939439350016214e-06,
+ "loss": 0.1397,
+ "step": 141
+ },
+ {
+ "epoch": 1.2456140350877192,
+ "grad_norm": 8.260516166687012,
+ "learning_rate": 4.9364618748634794e-06,
+ "loss": 0.1426,
+ "step": 142
+ },
+ {
+ "epoch": 1.2543859649122808,
+ "grad_norm": 2.09720516204834,
+ "learning_rate": 4.933413892647819e-06,
+ "loss": 0.1323,
+ "step": 143
+ },
+ {
+ "epoch": 1.263157894736842,
+ "grad_norm": 1.802125334739685,
+ "learning_rate": 4.9302954915726535e-06,
+ "loss": 0.1304,
+ "step": 144
+ },
+ {
+ "epoch": 1.2719298245614035,
+ "grad_norm": 1.7151471376419067,
+ "learning_rate": 4.927106761879207e-06,
+ "loss": 0.1264,
+ "step": 145
+ },
+ {
+ "epoch": 1.280701754385965,
+ "grad_norm": 1.6970336437225342,
+ "learning_rate": 4.923847795843894e-06,
+ "loss": 0.1227,
+ "step": 146
+ },
+ {
+ "epoch": 1.2894736842105263,
+ "grad_norm": 16.60441017150879,
+ "learning_rate": 4.920518687775647e-06,
+ "loss": 0.1606,
+ "step": 147
+ },
+ {
+ "epoch": 1.2982456140350878,
+ "grad_norm": 6.470354080200195,
+ "learning_rate": 4.917119534013194e-06,
+ "loss": 0.1447,
+ "step": 148
+ },
+ {
+ "epoch": 1.3070175438596492,
+ "grad_norm": 1.4908231496810913,
+ "learning_rate": 4.913650432922264e-06,
+ "loss": 0.1343,
+ "step": 149
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.19964861869812,
+ "learning_rate": 4.91011148489274e-06,
+ "loss": 0.1354,
+ "step": 150
+ },
+ {
+ "epoch": 1.3245614035087718,
+ "grad_norm": 2.6052839756011963,
+ "learning_rate": 4.906502792335761e-06,
+ "loss": 0.1342,
+ "step": 151
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.0719165802001953,
+ "learning_rate": 4.9028244596807525e-06,
+ "loss": 0.1359,
+ "step": 152
+ },
+ {
+ "epoch": 1.3421052631578947,
+ "grad_norm": 0.8086919784545898,
+ "learning_rate": 4.899076593372405e-06,
+ "loss": 0.1279,
+ "step": 153
+ },
+ {
+ "epoch": 1.3508771929824561,
+ "grad_norm": 1.0056848526000977,
+ "learning_rate": 4.8952593018675955e-06,
+ "loss": 0.1162,
+ "step": 154
+ },
+ {
+ "epoch": 1.3596491228070176,
+ "grad_norm": 5.72553014755249,
+ "learning_rate": 4.891372695632249e-06,
+ "loss": 0.1315,
+ "step": 155
+ },
+ {
+ "epoch": 1.368421052631579,
+ "grad_norm": 1.522894024848938,
+ "learning_rate": 4.887416887138139e-06,
+ "loss": 0.1266,
+ "step": 156
+ },
+ {
+ "epoch": 1.3771929824561404,
+ "grad_norm": 2.019472122192383,
+ "learning_rate": 4.883391990859635e-06,
+ "loss": 0.1262,
+ "step": 157
+ },
+ {
+ "epoch": 1.3859649122807016,
+ "grad_norm": 1.8594422340393066,
+ "learning_rate": 4.879298123270391e-06,
+ "loss": 0.125,
+ "step": 158
+ },
+ {
+ "epoch": 1.3947368421052633,
+ "grad_norm": 1.365377426147461,
+ "learning_rate": 4.8751354028399725e-06,
+ "loss": 0.1218,
+ "step": 159
+ },
+ {
+ "epoch": 1.4035087719298245,
+ "grad_norm": 3.553309917449951,
+ "learning_rate": 4.870903950030429e-06,
+ "loss": 0.1272,
+ "step": 160
+ },
+ {
+ "epoch": 1.412280701754386,
+ "grad_norm": 2.1770920753479004,
+ "learning_rate": 4.866603887292809e-06,
+ "loss": 0.1213,
+ "step": 161
+ },
+ {
+ "epoch": 1.4210526315789473,
+ "grad_norm": 1.6058955192565918,
+ "learning_rate": 4.862235339063613e-06,
+ "loss": 0.1173,
+ "step": 162
+ },
+ {
+ "epoch": 1.4298245614035088,
+ "grad_norm": 1.3208314180374146,
+ "learning_rate": 4.857798431761199e-06,
+ "loss": 0.1183,
+ "step": 163
+ },
+ {
+ "epoch": 1.4385964912280702,
+ "grad_norm": 1.282729983329773,
+ "learning_rate": 4.853293293782118e-06,
+ "loss": 0.1209,
+ "step": 164
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 1.3838152885437012,
+ "learning_rate": 4.848720055497401e-06,
+ "loss": 0.1198,
+ "step": 165
+ },
+ {
+ "epoch": 1.456140350877193,
+ "grad_norm": 1.2930737733840942,
+ "learning_rate": 4.844078849248785e-06,
+ "loss": 0.1268,
+ "step": 166
+ },
+ {
+ "epoch": 1.4649122807017543,
+ "grad_norm": 1.7022266387939453,
+ "learning_rate": 4.839369809344888e-06,
+ "loss": 0.1198,
+ "step": 167
+ },
+ {
+ "epoch": 1.4736842105263157,
+ "grad_norm": 1.0927815437316895,
+ "learning_rate": 4.834593072057313e-06,
+ "loss": 0.1132,
+ "step": 168
+ },
+ {
+ "epoch": 1.4824561403508771,
+ "grad_norm": 0.9326333999633789,
+ "learning_rate": 4.829748775616716e-06,
+ "loss": 0.1193,
+ "step": 169
+ },
+ {
+ "epoch": 1.4912280701754386,
+ "grad_norm": 1.3564742803573608,
+ "learning_rate": 4.8248370602087954e-06,
+ "loss": 0.118,
+ "step": 170
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.19778573513031,
+ "learning_rate": 4.819858067970243e-06,
+ "loss": 0.1122,
+ "step": 171
+ },
+ {
+ "epoch": 1.5087719298245614,
+ "grad_norm": 2.8438351154327393,
+ "learning_rate": 4.814811942984625e-06,
+ "loss": 0.1217,
+ "step": 172
+ },
+ {
+ "epoch": 1.5175438596491229,
+ "grad_norm": 1.0701063871383667,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 0.1114,
+ "step": 173
+ },
+ {
+ "epoch": 1.526315789473684,
+ "grad_norm": 0.9053553938865662,
+ "learning_rate": 4.804518880815776e-06,
+ "loss": 0.1178,
+ "step": 174
+ },
+ {
+ "epoch": 1.5350877192982457,
+ "grad_norm": 0.42274603247642517,
+ "learning_rate": 4.799272241496259e-06,
+ "loss": 0.1091,
+ "step": 175
+ },
+ {
+ "epoch": 1.543859649122807,
+ "grad_norm": 0.8576470017433167,
+ "learning_rate": 4.793959065148484e-06,
+ "loss": 0.1134,
+ "step": 176
+ },
+ {
+ "epoch": 1.5526315789473686,
+ "grad_norm": 0.5910662412643433,
+ "learning_rate": 4.78857950552674e-06,
+ "loss": 0.1148,
+ "step": 177
+ },
+ {
+ "epoch": 1.5614035087719298,
+ "grad_norm": 0.8761632442474365,
+ "learning_rate": 4.783133718306331e-06,
+ "loss": 0.1125,
+ "step": 178
+ },
+ {
+ "epoch": 1.5701754385964912,
+ "grad_norm": 1.9190795421600342,
+ "learning_rate": 4.777621861079079e-06,
+ "loss": 0.1148,
+ "step": 179
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 0.6199957728385925,
+ "learning_rate": 4.772044093348757e-06,
+ "loss": 0.1097,
+ "step": 180
+ },
+ {
+ "epoch": 1.587719298245614,
+ "grad_norm": 1.562089443206787,
+ "learning_rate": 4.766400576526479e-06,
+ "loss": 0.1097,
+ "step": 181
+ },
+ {
+ "epoch": 1.5964912280701755,
+ "grad_norm": 1.4957091808319092,
+ "learning_rate": 4.760691473926021e-06,
+ "loss": 0.1216,
+ "step": 182
+ },
+ {
+ "epoch": 1.6052631578947367,
+ "grad_norm": 0.9863570332527161,
+ "learning_rate": 4.754916950759105e-06,
+ "loss": 0.1122,
+ "step": 183
+ },
+ {
+ "epoch": 1.6140350877192984,
+ "grad_norm": 0.5803346633911133,
+ "learning_rate": 4.749077174130609e-06,
+ "loss": 0.1103,
+ "step": 184
+ },
+ {
+ "epoch": 1.6228070175438596,
+ "grad_norm": 1.8789891004562378,
+ "learning_rate": 4.743172313033738e-06,
+ "loss": 0.1191,
+ "step": 185
+ },
+ {
+ "epoch": 1.631578947368421,
+ "grad_norm": 0.8731380105018616,
+ "learning_rate": 4.7372025383451285e-06,
+ "loss": 0.1154,
+ "step": 186
+ },
+ {
+ "epoch": 1.6403508771929824,
+ "grad_norm": 1.3535627126693726,
+ "learning_rate": 4.7311680228199075e-06,
+ "loss": 0.1123,
+ "step": 187
+ },
+ {
+ "epoch": 1.6491228070175439,
+ "grad_norm": 0.7211089134216309,
+ "learning_rate": 4.725068941086693e-06,
+ "loss": 0.1134,
+ "step": 188
+ },
+ {
+ "epoch": 1.6578947368421053,
+ "grad_norm": 1.4752328395843506,
+ "learning_rate": 4.718905469642534e-06,
+ "loss": 0.1185,
+ "step": 189
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 0.9822680354118347,
+ "learning_rate": 4.712677786847814e-06,
+ "loss": 0.1146,
+ "step": 190
+ },
+ {
+ "epoch": 1.6754385964912282,
+ "grad_norm": 1.1308330297470093,
+ "learning_rate": 4.706386072921083e-06,
+ "loss": 0.1061,
+ "step": 191
+ },
+ {
+ "epoch": 1.6842105263157894,
+ "grad_norm": 5.331939697265625,
+ "learning_rate": 4.70003050993384e-06,
+ "loss": 0.1153,
+ "step": 192
+ },
+ {
+ "epoch": 1.692982456140351,
+ "grad_norm": 0.6911673545837402,
+ "learning_rate": 4.6936112818052674e-06,
+ "loss": 0.1098,
+ "step": 193
+ },
+ {
+ "epoch": 1.7017543859649122,
+ "grad_norm": 0.5160980224609375,
+ "learning_rate": 4.687128574296912e-06,
+ "loss": 0.1073,
+ "step": 194
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 1.5724798440933228,
+ "learning_rate": 4.680582575007303e-06,
+ "loss": 0.121,
+ "step": 195
+ },
+ {
+ "epoch": 1.719298245614035,
+ "grad_norm": 1.3960011005401611,
+ "learning_rate": 4.6739734733665275e-06,
+ "loss": 0.1145,
+ "step": 196
+ },
+ {
+ "epoch": 1.7280701754385965,
+ "grad_norm": 1.4949183464050293,
+ "learning_rate": 4.6673014606307465e-06,
+ "loss": 0.1166,
+ "step": 197
+ },
+ {
+ "epoch": 1.736842105263158,
+ "grad_norm": 1.6873422861099243,
+ "learning_rate": 4.660566729876661e-06,
+ "loss": 0.1115,
+ "step": 198
+ },
+ {
+ "epoch": 1.7456140350877192,
+ "grad_norm": 1.3443641662597656,
+ "learning_rate": 4.653769475995926e-06,
+ "loss": 0.1119,
+ "step": 199
+ },
+ {
+ "epoch": 1.7543859649122808,
+ "grad_norm": 0.807525098323822,
+ "learning_rate": 4.646909895689508e-06,
+ "loss": 0.1059,
+ "step": 200
+ },
+ {
+ "epoch": 1.763157894736842,
+ "grad_norm": 1.589316964149475,
+ "learning_rate": 4.639988187461995e-06,
+ "loss": 0.1151,
+ "step": 201
+ },
+ {
+ "epoch": 1.7719298245614035,
+ "grad_norm": 2.474756956100464,
+ "learning_rate": 4.633004551615851e-06,
+ "loss": 0.116,
+ "step": 202
+ },
+ {
+ "epoch": 1.780701754385965,
+ "grad_norm": 0.6210195422172546,
+ "learning_rate": 4.62595919024562e-06,
+ "loss": 0.1097,
+ "step": 203
+ },
+ {
+ "epoch": 1.7894736842105263,
+ "grad_norm": 0.7217905521392822,
+ "learning_rate": 4.618852307232078e-06,
+ "loss": 0.1117,
+ "step": 204
+ },
+ {
+ "epoch": 1.7982456140350878,
+ "grad_norm": 1.551251769065857,
+ "learning_rate": 4.611684108236334e-06,
+ "loss": 0.113,
+ "step": 205
+ },
+ {
+ "epoch": 1.807017543859649,
+ "grad_norm": 0.6619828939437866,
+ "learning_rate": 4.604454800693874e-06,
+ "loss": 0.113,
+ "step": 206
+ },
+ {
+ "epoch": 1.8157894736842106,
+ "grad_norm": 0.9461805820465088,
+ "learning_rate": 4.597164593808564e-06,
+ "loss": 0.1093,
+ "step": 207
+ },
+ {
+ "epoch": 1.8245614035087718,
+ "grad_norm": 1.2926547527313232,
+ "learning_rate": 4.589813698546592e-06,
+ "loss": 0.1128,
+ "step": 208
+ },
+ {
+ "epoch": 1.8333333333333335,
+ "grad_norm": 0.8754212856292725,
+ "learning_rate": 4.582402327630368e-06,
+ "loss": 0.1104,
+ "step": 209
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 0.846051812171936,
+ "learning_rate": 4.574930695532357e-06,
+ "loss": 0.1105,
+ "step": 210
+ },
+ {
+ "epoch": 1.8508771929824561,
+ "grad_norm": 1.3332515954971313,
+ "learning_rate": 4.567399018468889e-06,
+ "loss": 0.1101,
+ "step": 211
+ },
+ {
+ "epoch": 1.8596491228070176,
+ "grad_norm": 0.8729192614555359,
+ "learning_rate": 4.5598075143938855e-06,
+ "loss": 0.1081,
+ "step": 212
+ },
+ {
+ "epoch": 1.868421052631579,
+ "grad_norm": 0.8618345260620117,
+ "learning_rate": 4.552156402992567e-06,
+ "loss": 0.1059,
+ "step": 213
+ },
+ {
+ "epoch": 1.8771929824561404,
+ "grad_norm": 1.2135930061340332,
+ "learning_rate": 4.544445905675082e-06,
+ "loss": 0.1105,
+ "step": 214
+ },
+ {
+ "epoch": 1.8859649122807016,
+ "grad_norm": 0.8405666351318359,
+ "learning_rate": 4.536676245570111e-06,
+ "loss": 0.1118,
+ "step": 215
+ },
+ {
+ "epoch": 1.8947368421052633,
+ "grad_norm": 0.42860639095306396,
+ "learning_rate": 4.528847647518403e-06,
+ "loss": 0.1093,
+ "step": 216
+ },
+ {
+ "epoch": 1.9035087719298245,
+ "grad_norm": 1.1538206338882446,
+ "learning_rate": 4.520960338066271e-06,
+ "loss": 0.1088,
+ "step": 217
+ },
+ {
+ "epoch": 1.912280701754386,
+ "grad_norm": 0.5870749354362488,
+ "learning_rate": 4.513014545459038e-06,
+ "loss": 0.1061,
+ "step": 218
+ },
+ {
+ "epoch": 1.9210526315789473,
+ "grad_norm": 0.7279748916625977,
+ "learning_rate": 4.505010499634427e-06,
+ "loss": 0.1032,
+ "step": 219
+ },
+ {
+ "epoch": 1.9298245614035088,
+ "grad_norm": 0.6331414580345154,
+ "learning_rate": 4.4969484322159125e-06,
+ "loss": 0.1109,
+ "step": 220
+ },
+ {
+ "epoch": 1.9385964912280702,
+ "grad_norm": 0.9024543166160583,
+ "learning_rate": 4.488828576506014e-06,
+ "loss": 0.1094,
+ "step": 221
+ },
+ {
+ "epoch": 1.9473684210526314,
+ "grad_norm": 3.540376901626587,
+ "learning_rate": 4.480651167479545e-06,
+ "loss": 0.1154,
+ "step": 222
+ },
+ {
+ "epoch": 1.956140350877193,
+ "grad_norm": 0.9506739377975464,
+ "learning_rate": 4.472416441776817e-06,
+ "loss": 0.108,
+ "step": 223
+ },
+ {
+ "epoch": 1.9649122807017543,
+ "grad_norm": 0.6585081815719604,
+ "learning_rate": 4.464124637696786e-06,
+ "loss": 0.1033,
+ "step": 224
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 1.143038034439087,
+ "learning_rate": 4.455775995190161e-06,
+ "loss": 0.1092,
+ "step": 225
+ },
+ {
+ "epoch": 1.9824561403508771,
+ "grad_norm": 1.148261547088623,
+ "learning_rate": 4.4473707558524555e-06,
+ "loss": 0.1076,
+ "step": 226
+ },
+ {
+ "epoch": 1.9912280701754386,
+ "grad_norm": 0.7375811338424683,
+ "learning_rate": 4.438909162917003e-06,
+ "loss": 0.108,
+ "step": 227
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.5254591703414917,
+ "learning_rate": 4.430391461247911e-06,
+ "loss": 0.1079,
+ "step": 228
+ },
+ {
+ "epoch": 2.008771929824561,
+ "grad_norm": 1.0198495388031006,
+ "learning_rate": 4.42181789733298e-06,
+ "loss": 0.1083,
+ "step": 229
+ },
+ {
+ "epoch": 2.017543859649123,
+ "grad_norm": 0.9234157800674438,
+ "learning_rate": 4.413188719276569e-06,
+ "loss": 0.1084,
+ "step": 230
+ },
+ {
+ "epoch": 2.026315789473684,
+ "grad_norm": 0.5215068459510803,
+ "learning_rate": 4.404504176792414e-06,
+ "loss": 0.1067,
+ "step": 231
+ },
+ {
+ "epoch": 2.0350877192982457,
+ "grad_norm": 0.9296736121177673,
+ "learning_rate": 4.3957645211964065e-06,
+ "loss": 0.1066,
+ "step": 232
+ },
+ {
+ "epoch": 2.043859649122807,
+ "grad_norm": 0.8660671710968018,
+ "learning_rate": 4.386970005399314e-06,
+ "loss": 0.108,
+ "step": 233
+ },
+ {
+ "epoch": 2.0526315789473686,
+ "grad_norm": 0.6014883518218994,
+ "learning_rate": 4.378120883899467e-06,
+ "loss": 0.1068,
+ "step": 234
+ },
+ {
+ "epoch": 2.06140350877193,
+ "grad_norm": 0.6370371580123901,
+ "learning_rate": 4.369217412775393e-06,
+ "loss": 0.1076,
+ "step": 235
+ },
+ {
+ "epoch": 2.0701754385964914,
+ "grad_norm": 0.9806828498840332,
+ "learning_rate": 4.360259849678402e-06,
+ "loss": 0.1071,
+ "step": 236
+ },
+ {
+ "epoch": 2.0789473684210527,
+ "grad_norm": 0.6093440651893616,
+ "learning_rate": 4.351248453825137e-06,
+ "loss": 0.1038,
+ "step": 237
+ },
+ {
+ "epoch": 2.087719298245614,
+ "grad_norm": 1.3494842052459717,
+ "learning_rate": 4.3421834859900695e-06,
+ "loss": 0.1105,
+ "step": 238
+ },
+ {
+ "epoch": 2.0964912280701755,
+ "grad_norm": 0.7621576189994812,
+ "learning_rate": 4.333065208497949e-06,
+ "loss": 0.1048,
+ "step": 239
+ },
+ {
+ "epoch": 2.1052631578947367,
+ "grad_norm": 0.5918282866477966,
+ "learning_rate": 4.3238938852162195e-06,
+ "loss": 0.1086,
+ "step": 240
+ },
+ {
+ "epoch": 2.1140350877192984,
+ "grad_norm": 0.7048676609992981,
+ "learning_rate": 4.314669781547379e-06,
+ "loss": 0.1061,
+ "step": 241
+ },
+ {
+ "epoch": 2.1228070175438596,
+ "grad_norm": 1.0750821828842163,
+ "learning_rate": 4.305393164421301e-06,
+ "loss": 0.1082,
+ "step": 242
+ },
+ {
+ "epoch": 2.1315789473684212,
+ "grad_norm": 0.6171414852142334,
+ "learning_rate": 4.296064302287507e-06,
+ "loss": 0.1039,
+ "step": 243
+ },
+ {
+ "epoch": 2.1403508771929824,
+ "grad_norm": 0.8080905079841614,
+ "learning_rate": 4.286683465107403e-06,
+ "loss": 0.1069,
+ "step": 244
+ },
+ {
+ "epoch": 2.1491228070175437,
+ "grad_norm": 0.5281466245651245,
+ "learning_rate": 4.277250924346461e-06,
+ "loss": 0.1069,
+ "step": 245
+ },
+ {
+ "epoch": 2.1578947368421053,
+ "grad_norm": 0.8070254325866699,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.1061,
+ "step": 246
+ },
+ {
+ "epoch": 2.1666666666666665,
+ "grad_norm": 0.8560577630996704,
+ "learning_rate": 4.25823182541713e-06,
+ "loss": 0.1116,
+ "step": 247
+ },
+ {
+ "epoch": 2.175438596491228,
+ "grad_norm": 0.7772330045700073,
+ "learning_rate": 4.2486458176291176e-06,
+ "loss": 0.1092,
+ "step": 248
+ },
+ {
+ "epoch": 2.1842105263157894,
+ "grad_norm": 0.814601719379425,
+ "learning_rate": 4.239009207005096e-06,
+ "loss": 0.1093,
+ "step": 249
+ },
+ {
+ "epoch": 2.192982456140351,
+ "grad_norm": 0.957789957523346,
+ "learning_rate": 4.2293222724121855e-06,
+ "loss": 0.1075,
+ "step": 250
+ },
+ {
+ "epoch": 2.2017543859649122,
+ "grad_norm": 0.500062108039856,
+ "learning_rate": 4.219585294173799e-06,
+ "loss": 0.1048,
+ "step": 251
+ },
+ {
+ "epoch": 2.2105263157894735,
+ "grad_norm": 0.3866419792175293,
+ "learning_rate": 4.209798554061527e-06,
+ "loss": 0.1074,
+ "step": 252
+ },
+ {
+ "epoch": 2.219298245614035,
+ "grad_norm": 1.1853291988372803,
+ "learning_rate": 4.199962335286985e-06,
+ "loss": 0.1076,
+ "step": 253
+ },
+ {
+ "epoch": 2.2280701754385963,
+ "grad_norm": 0.36602887511253357,
+ "learning_rate": 4.1900769224936125e-06,
+ "loss": 0.108,
+ "step": 254
+ },
+ {
+ "epoch": 2.236842105263158,
+ "grad_norm": 0.2530711889266968,
+ "learning_rate": 4.180142601748447e-06,
+ "loss": 0.1041,
+ "step": 255
+ },
+ {
+ "epoch": 2.245614035087719,
+ "grad_norm": 1.3067054748535156,
+ "learning_rate": 4.170159660533834e-06,
+ "loss": 0.1087,
+ "step": 256
+ },
+ {
+ "epoch": 2.254385964912281,
+ "grad_norm": 0.3442043960094452,
+ "learning_rate": 4.160128387739114e-06,
+ "loss": 0.1099,
+ "step": 257
+ },
+ {
+ "epoch": 2.263157894736842,
+ "grad_norm": 1.174796462059021,
+ "learning_rate": 4.150049073652262e-06,
+ "loss": 0.1063,
+ "step": 258
+ },
+ {
+ "epoch": 2.2719298245614037,
+ "grad_norm": 0.5719411969184875,
+ "learning_rate": 4.1399220099514845e-06,
+ "loss": 0.1043,
+ "step": 259
+ },
+ {
+ "epoch": 2.280701754385965,
+ "grad_norm": 0.7268956303596497,
+ "learning_rate": 4.129747489696781e-06,
+ "loss": 0.1038,
+ "step": 260
+ },
+ {
+ "epoch": 2.2894736842105265,
+ "grad_norm": 0.7028316259384155,
+ "learning_rate": 4.119525807321467e-06,
+ "loss": 0.1052,
+ "step": 261
+ },
+ {
+ "epoch": 2.2982456140350878,
+ "grad_norm": 1.015335202217102,
+ "learning_rate": 4.109257258623644e-06,
+ "loss": 0.1116,
+ "step": 262
+ },
+ {
+ "epoch": 2.307017543859649,
+ "grad_norm": 0.7141755819320679,
+ "learning_rate": 4.098942140757646e-06,
+ "loss": 0.108,
+ "step": 263
+ },
+ {
+ "epoch": 2.3157894736842106,
+ "grad_norm": 0.7656403183937073,
+ "learning_rate": 4.0885807522254435e-06,
+ "loss": 0.1043,
+ "step": 264
+ },
+ {
+ "epoch": 2.324561403508772,
+ "grad_norm": 0.43293774127960205,
+ "learning_rate": 4.078173392867998e-06,
+ "loss": 0.1048,
+ "step": 265
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 0.6755763292312622,
+ "learning_rate": 4.0677203638565895e-06,
+ "loss": 0.1064,
+ "step": 266
+ },
+ {
+ "epoch": 2.3421052631578947,
+ "grad_norm": 0.9648827314376831,
+ "learning_rate": 4.0572219676841e-06,
+ "loss": 0.1088,
+ "step": 267
+ },
+ {
+ "epoch": 2.3508771929824563,
+ "grad_norm": 0.32724836468696594,
+ "learning_rate": 4.046678508156259e-06,
+ "loss": 0.1077,
+ "step": 268
+ },
+ {
+ "epoch": 2.3596491228070176,
+ "grad_norm": 0.4696657061576843,
+ "learning_rate": 4.036090290382855e-06,
+ "loss": 0.1067,
+ "step": 269
+ },
+ {
+ "epoch": 2.3684210526315788,
+ "grad_norm": 0.33901306986808777,
+ "learning_rate": 4.025457620768901e-06,
+ "loss": 0.105,
+ "step": 270
+ },
+ {
+ "epoch": 2.3771929824561404,
+ "grad_norm": 0.5703794360160828,
+ "learning_rate": 4.014780807005775e-06,
+ "loss": 0.1033,
+ "step": 271
+ },
+ {
+ "epoch": 2.3859649122807016,
+ "grad_norm": 0.9639355540275574,
+ "learning_rate": 4.004060158062306e-06,
+ "loss": 0.1041,
+ "step": 272
+ },
+ {
+ "epoch": 2.3947368421052633,
+ "grad_norm": 0.8851558566093445,
+ "learning_rate": 3.993295984175845e-06,
+ "loss": 0.1064,
+ "step": 273
+ },
+ {
+ "epoch": 2.4035087719298245,
+ "grad_norm": 0.5200062990188599,
+ "learning_rate": 3.982488596843276e-06,
+ "loss": 0.1056,
+ "step": 274
+ },
+ {
+ "epoch": 2.412280701754386,
+ "grad_norm": 1.160823106765747,
+ "learning_rate": 3.971638308812007e-06,
+ "loss": 0.1069,
+ "step": 275
+ },
+ {
+ "epoch": 2.4210526315789473,
+ "grad_norm": 1.0191210508346558,
+ "learning_rate": 3.9607454340709215e-06,
+ "loss": 0.1042,
+ "step": 276
+ },
+ {
+ "epoch": 2.4298245614035086,
+ "grad_norm": 0.37181487679481506,
+ "learning_rate": 3.949810287841289e-06,
+ "loss": 0.1062,
+ "step": 277
+ },
+ {
+ "epoch": 2.43859649122807,
+ "grad_norm": 0.9328593611717224,
+ "learning_rate": 3.9388331865676436e-06,
+ "loss": 0.1086,
+ "step": 278
+ },
+ {
+ "epoch": 2.4473684210526314,
+ "grad_norm": 0.8024734258651733,
+ "learning_rate": 3.927814447908625e-06,
+ "loss": 0.1051,
+ "step": 279
+ },
+ {
+ "epoch": 2.456140350877193,
+ "grad_norm": 0.9746696352958679,
+ "learning_rate": 3.916754390727795e-06,
+ "loss": 0.1041,
+ "step": 280
+ },
+ {
+ "epoch": 2.4649122807017543,
+ "grad_norm": 0.5457844138145447,
+ "learning_rate": 3.905653335084394e-06,
+ "loss": 0.1052,
+ "step": 281
+ },
+ {
+ "epoch": 2.473684210526316,
+ "grad_norm": 1.0736924409866333,
+ "learning_rate": 3.8945116022240945e-06,
+ "loss": 0.1075,
+ "step": 282
+ },
+ {
+ "epoch": 2.482456140350877,
+ "grad_norm": 0.6335628032684326,
+ "learning_rate": 3.8833295145696964e-06,
+ "loss": 0.1036,
+ "step": 283
+ },
+ {
+ "epoch": 2.4912280701754383,
+ "grad_norm": 0.6909618377685547,
+ "learning_rate": 3.872107395711799e-06,
+ "loss": 0.1089,
+ "step": 284
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 2.1871702671051025,
+ "learning_rate": 3.860845570399435e-06,
+ "loss": 0.1066,
+ "step": 285
+ },
+ {
+ "epoch": 2.5087719298245617,
+ "grad_norm": 0.5831722617149353,
+ "learning_rate": 3.849544364530678e-06,
+ "loss": 0.1055,
+ "step": 286
+ },
+ {
+ "epoch": 2.517543859649123,
+ "grad_norm": 0.5302637815475464,
+ "learning_rate": 3.838204105143204e-06,
+ "loss": 0.1057,
+ "step": 287
+ },
+ {
+ "epoch": 2.526315789473684,
+ "grad_norm": 0.6348035931587219,
+ "learning_rate": 3.8268251204048335e-06,
+ "loss": 0.1089,
+ "step": 288
+ },
+ {
+ "epoch": 2.5350877192982457,
+ "grad_norm": 2.1932008266448975,
+ "learning_rate": 3.815407739604033e-06,
+ "loss": 0.1043,
+ "step": 289
+ },
+ {
+ "epoch": 2.543859649122807,
+ "grad_norm": 0.4388940930366516,
+ "learning_rate": 3.803952293140385e-06,
+ "loss": 0.1055,
+ "step": 290
+ },
+ {
+ "epoch": 2.5526315789473686,
+ "grad_norm": 0.6853339076042175,
+ "learning_rate": 3.7924591125150265e-06,
+ "loss": 0.1036,
+ "step": 291
+ },
+ {
+ "epoch": 2.56140350877193,
+ "grad_norm": 0.34744876623153687,
+ "learning_rate": 3.78092853032106e-06,
+ "loss": 0.1025,
+ "step": 292
+ },
+ {
+ "epoch": 2.5701754385964914,
+ "grad_norm": 0.9523847699165344,
+ "learning_rate": 3.769360880233922e-06,
+ "loss": 0.1067,
+ "step": 293
+ },
+ {
+ "epoch": 2.5789473684210527,
+ "grad_norm": 1.303745985031128,
+ "learning_rate": 3.7577564970017338e-06,
+ "loss": 0.1082,
+ "step": 294
+ },
+ {
+ "epoch": 2.587719298245614,
+ "grad_norm": 0.9468981623649597,
+ "learning_rate": 3.7461157164356103e-06,
+ "loss": 0.1055,
+ "step": 295
+ },
+ {
+ "epoch": 2.5964912280701755,
+ "grad_norm": 0.7204175591468811,
+ "learning_rate": 3.7344388753999434e-06,
+ "loss": 0.1055,
+ "step": 296
+ },
+ {
+ "epoch": 2.6052631578947367,
+ "grad_norm": 0.5110165476799011,
+ "learning_rate": 3.7227263118026537e-06,
+ "loss": 0.1092,
+ "step": 297
+ },
+ {
+ "epoch": 2.6140350877192984,
+ "grad_norm": 0.6483246088027954,
+ "learning_rate": 3.7109783645854116e-06,
+ "loss": 0.1078,
+ "step": 298
+ },
+ {
+ "epoch": 2.6228070175438596,
+ "grad_norm": 0.5058422684669495,
+ "learning_rate": 3.699195373713831e-06,
+ "loss": 0.1073,
+ "step": 299
+ },
+ {
+ "epoch": 2.6315789473684212,
+ "grad_norm": 0.4123518764972687,
+ "learning_rate": 3.6873776801676265e-06,
+ "loss": 0.1053,
+ "step": 300
+ },
+ {
+ "epoch": 2.6403508771929824,
+ "grad_norm": 1.0864709615707397,
+ "learning_rate": 3.675525625930751e-06,
+ "loss": 0.1048,
+ "step": 301
+ },
+ {
+ "epoch": 2.6491228070175437,
+ "grad_norm": 1.0264904499053955,
+ "learning_rate": 3.6636395539814975e-06,
+ "loss": 0.1059,
+ "step": 302
+ },
+ {
+ "epoch": 2.6578947368421053,
+ "grad_norm": 0.7724822163581848,
+ "learning_rate": 3.651719808282573e-06,
+ "loss": 0.1063,
+ "step": 303
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 0.7474755644798279,
+ "learning_rate": 3.6397667337711475e-06,
+ "loss": 0.1034,
+ "step": 304
+ },
+ {
+ "epoch": 2.675438596491228,
+ "grad_norm": 0.5628909468650818,
+ "learning_rate": 3.6277806763488666e-06,
+ "loss": 0.1026,
+ "step": 305
+ },
+ {
+ "epoch": 2.6842105263157894,
+ "grad_norm": 0.9070547819137573,
+ "learning_rate": 3.6157619828718477e-06,
+ "loss": 0.1031,
+ "step": 306
+ },
+ {
+ "epoch": 2.692982456140351,
+ "grad_norm": 0.6968091130256653,
+ "learning_rate": 3.603711001140641e-06,
+ "loss": 0.1068,
+ "step": 307
+ },
+ {
+ "epoch": 2.7017543859649122,
+ "grad_norm": 0.3764977753162384,
+ "learning_rate": 3.5916280798901604e-06,
+ "loss": 0.1038,
+ "step": 308
+ },
+ {
+ "epoch": 2.7105263157894735,
+ "grad_norm": 5.012625694274902,
+ "learning_rate": 3.5795135687795984e-06,
+ "loss": 0.1129,
+ "step": 309
+ },
+ {
+ "epoch": 2.719298245614035,
+ "grad_norm": 0.6745572686195374,
+ "learning_rate": 3.567367818382303e-06,
+ "loss": 0.1071,
+ "step": 310
+ },
+ {
+ "epoch": 2.7280701754385968,
+ "grad_norm": 1.0659606456756592,
+ "learning_rate": 3.555191180175634e-06,
+ "loss": 0.1067,
+ "step": 311
+ },
+ {
+ "epoch": 2.736842105263158,
+ "grad_norm": 1.7312604188919067,
+ "learning_rate": 3.5429840065307924e-06,
+ "loss": 0.1101,
+ "step": 312
+ },
+ {
+ "epoch": 2.745614035087719,
+ "grad_norm": 1.100364327430725,
+ "learning_rate": 3.5307466507026223e-06,
+ "loss": 0.1098,
+ "step": 313
+ },
+ {
+ "epoch": 2.754385964912281,
+ "grad_norm": 1.0390428304672241,
+ "learning_rate": 3.5184794668193893e-06,
+ "loss": 0.1094,
+ "step": 314
+ },
+ {
+ "epoch": 2.763157894736842,
+ "grad_norm": 0.3369971811771393,
+ "learning_rate": 3.5061828098725327e-06,
+ "loss": 0.1053,
+ "step": 315
+ },
+ {
+ "epoch": 2.7719298245614032,
+ "grad_norm": 0.6130257248878479,
+ "learning_rate": 3.4938570357063906e-06,
+ "loss": 0.106,
+ "step": 316
+ },
+ {
+ "epoch": 2.780701754385965,
+ "grad_norm": 0.6387595534324646,
+ "learning_rate": 3.481502501007904e-06,
+ "loss": 0.1044,
+ "step": 317
+ },
+ {
+ "epoch": 2.7894736842105265,
+ "grad_norm": 1.0731587409973145,
+ "learning_rate": 3.469119563296296e-06,
+ "loss": 0.1097,
+ "step": 318
+ },
+ {
+ "epoch": 2.7982456140350878,
+ "grad_norm": 0.8096229434013367,
+ "learning_rate": 3.4567085809127247e-06,
+ "loss": 0.1076,
+ "step": 319
+ },
+ {
+ "epoch": 2.807017543859649,
+ "grad_norm": 0.5034844279289246,
+ "learning_rate": 3.444269913009912e-06,
+ "loss": 0.1071,
+ "step": 320
+ },
+ {
+ "epoch": 2.8157894736842106,
+ "grad_norm": 0.675139307975769,
+ "learning_rate": 3.4318039195417536e-06,
+ "loss": 0.1039,
+ "step": 321
+ },
+ {
+ "epoch": 2.824561403508772,
+ "grad_norm": 0.7330355644226074,
+ "learning_rate": 3.4193109612528972e-06,
+ "loss": 0.1044,
+ "step": 322
+ },
+ {
+ "epoch": 2.8333333333333335,
+ "grad_norm": 0.6558271646499634,
+ "learning_rate": 3.4067913996683115e-06,
+ "loss": 0.1051,
+ "step": 323
+ },
+ {
+ "epoch": 2.8421052631578947,
+ "grad_norm": 0.8411844372749329,
+ "learning_rate": 3.3942455970828146e-06,
+ "loss": 0.1063,
+ "step": 324
+ },
+ {
+ "epoch": 2.8508771929824563,
+ "grad_norm": 0.4817325174808502,
+ "learning_rate": 3.3816739165505964e-06,
+ "loss": 0.105,
+ "step": 325
+ },
+ {
+ "epoch": 2.8596491228070176,
+ "grad_norm": 0.424554705619812,
+ "learning_rate": 3.3690767218747104e-06,
+ "loss": 0.1037,
+ "step": 326
+ },
+ {
+ "epoch": 2.8684210526315788,
+ "grad_norm": 1.0054417848587036,
+ "learning_rate": 3.3564543775965475e-06,
+ "loss": 0.1058,
+ "step": 327
+ },
+ {
+ "epoch": 2.8771929824561404,
+ "grad_norm": 0.8984584808349609,
+ "learning_rate": 3.3438072489852837e-06,
+ "loss": 0.1079,
+ "step": 328
+ },
+ {
+ "epoch": 2.8859649122807016,
+ "grad_norm": 0.6779558062553406,
+ "learning_rate": 3.331135702027311e-06,
+ "loss": 0.1046,
+ "step": 329
+ },
+ {
+ "epoch": 2.8947368421052633,
+ "grad_norm": 0.6931657195091248,
+ "learning_rate": 3.318440103415649e-06,
+ "loss": 0.1106,
+ "step": 330
+ },
+ {
+ "epoch": 2.9035087719298245,
+ "grad_norm": 0.705264151096344,
+ "learning_rate": 3.305720820539329e-06,
+ "loss": 0.104,
+ "step": 331
+ },
+ {
+ "epoch": 2.912280701754386,
+ "grad_norm": 0.7799407839775085,
+ "learning_rate": 3.2929782214727657e-06,
+ "loss": 0.1019,
+ "step": 332
+ },
+ {
+ "epoch": 2.9210526315789473,
+ "grad_norm": 0.7583760619163513,
+ "learning_rate": 3.2802126749651042e-06,
+ "loss": 0.1049,
+ "step": 333
+ },
+ {
+ "epoch": 2.9298245614035086,
+ "grad_norm": 0.6145837306976318,
+ "learning_rate": 3.2674245504295505e-06,
+ "loss": 0.104,
+ "step": 334
+ },
+ {
+ "epoch": 2.93859649122807,
+ "grad_norm": 0.5170779228210449,
+ "learning_rate": 3.254614217932679e-06,
+ "loss": 0.1024,
+ "step": 335
+ },
+ {
+ "epoch": 2.9473684210526314,
+ "grad_norm": 0.6850940585136414,
+ "learning_rate": 3.241782048183726e-06,
+ "loss": 0.1047,
+ "step": 336
+ },
+ {
+ "epoch": 2.956140350877193,
+ "grad_norm": 0.7307694554328918,
+ "learning_rate": 3.2289284125238597e-06,
+ "loss": 0.1032,
+ "step": 337
+ },
+ {
+ "epoch": 2.9649122807017543,
+ "grad_norm": 0.3386179208755493,
+ "learning_rate": 3.216053682915436e-06,
+ "loss": 0.1037,
+ "step": 338
+ },
+ {
+ "epoch": 2.973684210526316,
+ "grad_norm": 0.7565059065818787,
+ "learning_rate": 3.203158231931234e-06,
+ "loss": 0.1048,
+ "step": 339
+ },
+ {
+ "epoch": 2.982456140350877,
+ "grad_norm": 0.7902039289474487,
+ "learning_rate": 3.190242432743673e-06,
+ "loss": 0.1068,
+ "step": 340
+ },
+ {
+ "epoch": 2.9912280701754383,
+ "grad_norm": 0.42595192790031433,
+ "learning_rate": 3.177306659114015e-06,
+ "loss": 0.1039,
+ "step": 341
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.1214542388916016,
+ "learning_rate": 3.164351285381549e-06,
+ "loss": 0.1062,
+ "step": 342
+ },
+ {
+ "epoch": 3.008771929824561,
+ "grad_norm": 0.7622955441474915,
+ "learning_rate": 3.1513766864527577e-06,
+ "loss": 0.1015,
+ "step": 343
+ },
+ {
+ "epoch": 3.017543859649123,
+ "grad_norm": 0.2676297724246979,
+ "learning_rate": 3.1383832377904676e-06,
+ "loss": 0.1037,
+ "step": 344
+ },
+ {
+ "epoch": 3.026315789473684,
+ "grad_norm": 0.8695605397224426,
+ "learning_rate": 3.1253713154029857e-06,
+ "loss": 0.1056,
+ "step": 345
+ },
+ {
+ "epoch": 3.0350877192982457,
+ "grad_norm": 0.5875906944274902,
+ "learning_rate": 3.1123412958332155e-06,
+ "loss": 0.1067,
+ "step": 346
+ },
+ {
+ "epoch": 3.043859649122807,
+ "grad_norm": 0.7699372172355652,
+ "learning_rate": 3.0992935561477632e-06,
+ "loss": 0.1035,
+ "step": 347
+ },
+ {
+ "epoch": 3.0526315789473686,
+ "grad_norm": 0.5919204354286194,
+ "learning_rate": 3.0862284739260247e-06,
+ "loss": 0.1023,
+ "step": 348
+ },
+ {
+ "epoch": 3.06140350877193,
+ "grad_norm": 1.3211849927902222,
+ "learning_rate": 3.07314642724926e-06,
+ "loss": 0.1065,
+ "step": 349
+ },
+ {
+ "epoch": 3.0701754385964914,
+ "grad_norm": 0.6359637379646301,
+ "learning_rate": 3.0600477946896494e-06,
+ "loss": 0.106,
+ "step": 350
+ },
+ {
+ "epoch": 3.0789473684210527,
+ "grad_norm": 0.35776662826538086,
+ "learning_rate": 3.046932955299344e-06,
+ "loss": 0.1046,
+ "step": 351
+ },
+ {
+ "epoch": 3.087719298245614,
+ "grad_norm": 0.6657406687736511,
+ "learning_rate": 3.0338022885994904e-06,
+ "loss": 0.1076,
+ "step": 352
+ },
+ {
+ "epoch": 3.0964912280701755,
+ "grad_norm": 0.7587785720825195,
+ "learning_rate": 3.0206561745692512e-06,
+ "loss": 0.1043,
+ "step": 353
+ },
+ {
+ "epoch": 3.1052631578947367,
+ "grad_norm": 1.1258317232131958,
+ "learning_rate": 3.0074949936348084e-06,
+ "loss": 0.1043,
+ "step": 354
+ },
+ {
+ "epoch": 3.1140350877192984,
+ "grad_norm": 0.3570568263530731,
+ "learning_rate": 2.9943191266583564e-06,
+ "loss": 0.1032,
+ "step": 355
+ },
+ {
+ "epoch": 3.1228070175438596,
+ "grad_norm": 0.843485414981842,
+ "learning_rate": 2.981128954927075e-06,
+ "loss": 0.1045,
+ "step": 356
+ },
+ {
+ "epoch": 3.1315789473684212,
+ "grad_norm": 0.5719651579856873,
+ "learning_rate": 2.967924860142103e-06,
+ "loss": 0.1052,
+ "step": 357
+ },
+ {
+ "epoch": 3.1403508771929824,
+ "grad_norm": 2.20767879486084,
+ "learning_rate": 2.9547072244074853e-06,
+ "loss": 0.1078,
+ "step": 358
+ },
+ {
+ "epoch": 3.1491228070175437,
+ "grad_norm": 0.3715457022190094,
+ "learning_rate": 2.941476430219122e-06,
+ "loss": 0.1047,
+ "step": 359
+ },
+ {
+ "epoch": 3.1578947368421053,
+ "grad_norm": 0.7803200483322144,
+ "learning_rate": 2.928232860453694e-06,
+ "loss": 0.1029,
+ "step": 360
+ },
+ {
+ "epoch": 3.1666666666666665,
+ "grad_norm": 0.5198164582252502,
+ "learning_rate": 2.9149768983575884e-06,
+ "loss": 0.1032,
+ "step": 361
+ },
+ {
+ "epoch": 3.175438596491228,
+ "grad_norm": 0.7827185988426208,
+ "learning_rate": 2.9017089275358017e-06,
+ "loss": 0.1043,
+ "step": 362
+ },
+ {
+ "epoch": 3.1842105263157894,
+ "grad_norm": 0.4000351130962372,
+ "learning_rate": 2.8884293319408464e-06,
+ "loss": 0.1071,
+ "step": 363
+ },
+ {
+ "epoch": 3.192982456140351,
+ "grad_norm": 0.9913386106491089,
+ "learning_rate": 2.8751384958616318e-06,
+ "loss": 0.1022,
+ "step": 364
+ },
+ {
+ "epoch": 3.2017543859649122,
+ "grad_norm": 0.6975695490837097,
+ "learning_rate": 2.861836803912353e-06,
+ "loss": 0.1029,
+ "step": 365
+ },
+ {
+ "epoch": 3.2105263157894735,
+ "grad_norm": 0.2372695654630661,
+ "learning_rate": 2.8485246410213497e-06,
+ "loss": 0.1015,
+ "step": 366
+ },
+ {
+ "epoch": 3.219298245614035,
+ "grad_norm": 0.447732537984848,
+ "learning_rate": 2.835202392419977e-06,
+ "loss": 0.1052,
+ "step": 367
+ },
+ {
+ "epoch": 3.2280701754385963,
+ "grad_norm": 0.6617346405982971,
+ "learning_rate": 2.8218704436314525e-06,
+ "loss": 0.1055,
+ "step": 368
+ },
+ {
+ "epoch": 3.236842105263158,
+ "grad_norm": 0.5550402402877808,
+ "learning_rate": 2.8085291804596995e-06,
+ "loss": 0.102,
+ "step": 369
+ },
+ {
+ "epoch": 3.245614035087719,
+ "grad_norm": 0.6046020984649658,
+ "learning_rate": 2.795178988978185e-06,
+ "loss": 0.1036,
+ "step": 370
+ },
+ {
+ "epoch": 3.254385964912281,
+ "grad_norm": 0.41890618205070496,
+ "learning_rate": 2.781820255518745e-06,
+ "loss": 0.1036,
+ "step": 371
+ },
+ {
+ "epoch": 3.263157894736842,
+ "grad_norm": 0.8387415409088135,
+ "learning_rate": 2.768453366660408e-06,
+ "loss": 0.1076,
+ "step": 372
+ },
+ {
+ "epoch": 3.2719298245614037,
+ "grad_norm": 0.5318773984909058,
+ "learning_rate": 2.755078709218203e-06,
+ "loss": 0.1052,
+ "step": 373
+ },
+ {
+ "epoch": 3.280701754385965,
+ "grad_norm": 0.6617523431777954,
+ "learning_rate": 2.741696670231969e-06,
+ "loss": 0.1049,
+ "step": 374
+ },
+ {
+ "epoch": 3.2894736842105265,
+ "grad_norm": 1.0190025568008423,
+ "learning_rate": 2.728307636955156e-06,
+ "loss": 0.1034,
+ "step": 375
+ },
+ {
+ "epoch": 3.2982456140350878,
+ "grad_norm": 0.6924716234207153,
+ "learning_rate": 2.714911996843617e-06,
+ "loss": 0.1065,
+ "step": 376
+ },
+ {
+ "epoch": 3.307017543859649,
+ "grad_norm": 0.42501118779182434,
+ "learning_rate": 2.701510137544393e-06,
+ "loss": 0.1019,
+ "step": 377
+ },
+ {
+ "epoch": 3.3157894736842106,
+ "grad_norm": 0.844886064529419,
+ "learning_rate": 2.6881024468845e-06,
+ "loss": 0.1047,
+ "step": 378
+ },
+ {
+ "epoch": 3.324561403508772,
+ "grad_norm": 0.46512728929519653,
+ "learning_rate": 2.674689312859704e-06,
+ "loss": 0.1043,
+ "step": 379
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 0.6242017149925232,
+ "learning_rate": 2.6612711236232915e-06,
+ "loss": 0.1046,
+ "step": 380
+ },
+ {
+ "epoch": 3.3421052631578947,
+ "grad_norm": 0.6578526496887207,
+ "learning_rate": 2.6478482674748375e-06,
+ "loss": 0.1031,
+ "step": 381
+ },
+ {
+ "epoch": 3.3508771929824563,
+ "grad_norm": 0.4822542667388916,
+ "learning_rate": 2.63442113284897e-06,
+ "loss": 0.1053,
+ "step": 382
+ },
+ {
+ "epoch": 3.3596491228070176,
+ "grad_norm": 0.48255595564842224,
+ "learning_rate": 2.6209901083041307e-06,
+ "loss": 0.1058,
+ "step": 383
+ },
+ {
+ "epoch": 3.3684210526315788,
+ "grad_norm": 0.6624025702476501,
+ "learning_rate": 2.6075555825113265e-06,
+ "loss": 0.1066,
+ "step": 384
+ },
+ {
+ "epoch": 3.3771929824561404,
+ "grad_norm": 0.6962618827819824,
+ "learning_rate": 2.5941179442428864e-06,
+ "loss": 0.102,
+ "step": 385
+ },
+ {
+ "epoch": 3.3859649122807016,
+ "grad_norm": 0.4976450502872467,
+ "learning_rate": 2.580677582361208e-06,
+ "loss": 0.1011,
+ "step": 386
+ },
+ {
+ "epoch": 3.3947368421052633,
+ "grad_norm": 0.5283737182617188,
+ "learning_rate": 2.5672348858075053e-06,
+ "loss": 0.1057,
+ "step": 387
+ },
+ {
+ "epoch": 3.4035087719298245,
+ "grad_norm": 0.32338738441467285,
+ "learning_rate": 2.553790243590556e-06,
+ "loss": 0.1015,
+ "step": 388
+ },
+ {
+ "epoch": 3.412280701754386,
+ "grad_norm": 0.7909435629844666,
+ "learning_rate": 2.5403440447754385e-06,
+ "loss": 0.1036,
+ "step": 389
+ },
+ {
+ "epoch": 3.4210526315789473,
+ "grad_norm": 0.6297115087509155,
+ "learning_rate": 2.5268966784722792e-06,
+ "loss": 0.1042,
+ "step": 390
+ },
+ {
+ "epoch": 3.4298245614035086,
+ "grad_norm": 0.32988762855529785,
+ "learning_rate": 2.513448533824988e-06,
+ "loss": 0.1059,
+ "step": 391
+ },
+ {
+ "epoch": 3.43859649122807,
+ "grad_norm": 0.9211220145225525,
+ "learning_rate": 2.5e-06,
+ "loss": 0.1015,
+ "step": 392
+ },
+ {
+ "epoch": 3.4473684210526314,
+ "grad_norm": 1.2157588005065918,
+ "learning_rate": 2.486551466175013e-06,
+ "loss": 0.1035,
+ "step": 393
+ },
+ {
+ "epoch": 3.456140350877193,
+ "grad_norm": 0.4786648452281952,
+ "learning_rate": 2.4731033215277216e-06,
+ "loss": 0.1026,
+ "step": 394
+ },
+ {
+ "epoch": 3.4649122807017543,
+ "grad_norm": 0.37398242950439453,
+ "learning_rate": 2.4596559552245623e-06,
+ "loss": 0.1044,
+ "step": 395
+ },
+ {
+ "epoch": 3.473684210526316,
+ "grad_norm": 0.5536217093467712,
+ "learning_rate": 2.446209756409445e-06,
+ "loss": 0.1043,
+ "step": 396
+ },
+ {
+ "epoch": 3.482456140350877,
+ "grad_norm": 0.708406925201416,
+ "learning_rate": 2.432765114192495e-06,
+ "loss": 0.1046,
+ "step": 397
+ },
+ {
+ "epoch": 3.4912280701754383,
+ "grad_norm": 0.7140893340110779,
+ "learning_rate": 2.4193224176387926e-06,
+ "loss": 0.1039,
+ "step": 398
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 0.8078088760375977,
+ "learning_rate": 2.4058820557571144e-06,
+ "loss": 0.1013,
+ "step": 399
+ },
+ {
+ "epoch": 3.5087719298245617,
+ "grad_norm": 0.7129591107368469,
+ "learning_rate": 2.3924444174886735e-06,
+ "loss": 0.1057,
+ "step": 400
+ },
+ {
+ "epoch": 3.517543859649123,
+ "grad_norm": 1.293412446975708,
+ "learning_rate": 2.37900989169587e-06,
+ "loss": 0.1081,
+ "step": 401
+ },
+ {
+ "epoch": 3.526315789473684,
+ "grad_norm": 0.7235314249992371,
+ "learning_rate": 2.3655788671510314e-06,
+ "loss": 0.1054,
+ "step": 402
+ },
+ {
+ "epoch": 3.5350877192982457,
+ "grad_norm": 0.6008841395378113,
+ "learning_rate": 2.3521517325251637e-06,
+ "loss": 0.1033,
+ "step": 403
+ },
+ {
+ "epoch": 3.543859649122807,
+ "grad_norm": 0.6819609999656677,
+ "learning_rate": 2.3387288763767097e-06,
+ "loss": 0.1019,
+ "step": 404
+ },
+ {
+ "epoch": 3.5526315789473686,
+ "grad_norm": 0.5696406960487366,
+ "learning_rate": 2.325310687140296e-06,
+ "loss": 0.1043,
+ "step": 405
+ },
+ {
+ "epoch": 3.56140350877193,
+ "grad_norm": 0.8597077131271362,
+ "learning_rate": 2.3118975531155003e-06,
+ "loss": 0.1037,
+ "step": 406
+ },
+ {
+ "epoch": 3.5701754385964914,
+ "grad_norm": 0.43985217809677124,
+ "learning_rate": 2.2984898624556075e-06,
+ "loss": 0.105,
+ "step": 407
+ },
+ {
+ "epoch": 3.5789473684210527,
+ "grad_norm": 0.5448469519615173,
+ "learning_rate": 2.2850880031563845e-06,
+ "loss": 0.1037,
+ "step": 408
+ },
+ {
+ "epoch": 3.587719298245614,
+ "grad_norm": 0.8221977949142456,
+ "learning_rate": 2.271692363044845e-06,
+ "loss": 0.1015,
+ "step": 409
+ },
+ {
+ "epoch": 3.5964912280701755,
+ "grad_norm": 0.9838594198226929,
+ "learning_rate": 2.2583033297680316e-06,
+ "loss": 0.1085,
+ "step": 410
+ },
+ {
+ "epoch": 3.6052631578947367,
+ "grad_norm": 1.034848928451538,
+ "learning_rate": 2.2449212907817985e-06,
+ "loss": 0.104,
+ "step": 411
+ },
+ {
+ "epoch": 3.6140350877192984,
+ "grad_norm": 1.0788371562957764,
+ "learning_rate": 2.2315466333395927e-06,
+ "loss": 0.1033,
+ "step": 412
+ },
+ {
+ "epoch": 3.6228070175438596,
+ "grad_norm": 0.49096915125846863,
+ "learning_rate": 2.2181797444812557e-06,
+ "loss": 0.1044,
+ "step": 413
+ },
+ {
+ "epoch": 3.6315789473684212,
+ "grad_norm": 1.309685230255127,
+ "learning_rate": 2.204821011021815e-06,
+ "loss": 0.1036,
+ "step": 414
+ },
+ {
+ "epoch": 3.6403508771929824,
+ "grad_norm": 0.5014146566390991,
+ "learning_rate": 2.191470819540301e-06,
+ "loss": 0.104,
+ "step": 415
+ },
+ {
+ "epoch": 3.6491228070175437,
+ "grad_norm": 0.770470380783081,
+ "learning_rate": 2.178129556368548e-06,
+ "loss": 0.1049,
+ "step": 416
+ },
+ {
+ "epoch": 3.6578947368421053,
+ "grad_norm": 0.4639376699924469,
+ "learning_rate": 2.1647976075800235e-06,
+ "loss": 0.1047,
+ "step": 417
+ },
+ {
+ "epoch": 3.6666666666666665,
+ "grad_norm": 1.101885437965393,
+ "learning_rate": 2.151475358978652e-06,
+ "loss": 0.1035,
+ "step": 418
+ },
+ {
+ "epoch": 3.675438596491228,
+ "grad_norm": 0.5644329786300659,
+ "learning_rate": 2.138163196087648e-06,
+ "loss": 0.103,
+ "step": 419
+ },
+ {
+ "epoch": 3.6842105263157894,
+ "grad_norm": 1.1015008687973022,
+ "learning_rate": 2.1248615041383686e-06,
+ "loss": 0.1054,
+ "step": 420
+ },
+ {
+ "epoch": 3.692982456140351,
+ "grad_norm": 0.7311366200447083,
+ "learning_rate": 2.111570668059155e-06,
+ "loss": 0.1043,
+ "step": 421
+ },
+ {
+ "epoch": 3.7017543859649122,
+ "grad_norm": 0.38242173194885254,
+ "learning_rate": 2.098291072464199e-06,
+ "loss": 0.1041,
+ "step": 422
+ },
+ {
+ "epoch": 3.7105263157894735,
+ "grad_norm": 1.231512188911438,
+ "learning_rate": 2.085023101642412e-06,
+ "loss": 0.1021,
+ "step": 423
+ },
+ {
+ "epoch": 3.719298245614035,
+ "grad_norm": 0.41761213541030884,
+ "learning_rate": 2.0717671395463063e-06,
+ "loss": 0.1062,
+ "step": 424
+ },
+ {
+ "epoch": 3.7280701754385968,
+ "grad_norm": 0.4593309462070465,
+ "learning_rate": 2.0585235697808794e-06,
+ "loss": 0.1012,
+ "step": 425
+ },
+ {
+ "epoch": 3.736842105263158,
+ "grad_norm": 0.9147135019302368,
+ "learning_rate": 2.0452927755925155e-06,
+ "loss": 0.1046,
+ "step": 426
+ },
+ {
+ "epoch": 3.745614035087719,
+ "grad_norm": 0.39639535546302795,
+ "learning_rate": 2.0320751398578984e-06,
+ "loss": 0.1018,
+ "step": 427
+ },
+ {
+ "epoch": 3.754385964912281,
+ "grad_norm": 0.688010573387146,
+ "learning_rate": 2.0188710450729255e-06,
+ "loss": 0.104,
+ "step": 428
+ },
+ {
+ "epoch": 3.763157894736842,
+ "grad_norm": 0.5140353441238403,
+ "learning_rate": 2.005680873341644e-06,
+ "loss": 0.1033,
+ "step": 429
+ },
+ {
+ "epoch": 3.7719298245614032,
+ "grad_norm": 0.5970481634140015,
+ "learning_rate": 1.992505006365191e-06,
+ "loss": 0.1044,
+ "step": 430
+ },
+ {
+ "epoch": 3.780701754385965,
+ "grad_norm": 0.551162838935852,
+ "learning_rate": 1.9793438254307496e-06,
+ "loss": 0.1042,
+ "step": 431
+ },
+ {
+ "epoch": 3.7894736842105265,
+ "grad_norm": 0.5344637632369995,
+ "learning_rate": 1.96619771140051e-06,
+ "loss": 0.1042,
+ "step": 432
+ },
+ {
+ "epoch": 3.7982456140350878,
+ "grad_norm": 0.5357667207717896,
+ "learning_rate": 1.9530670447006566e-06,
+ "loss": 0.101,
+ "step": 433
+ },
+ {
+ "epoch": 3.807017543859649,
+ "grad_norm": 1.2536660432815552,
+ "learning_rate": 1.9399522053103514e-06,
+ "loss": 0.1008,
+ "step": 434
+ },
+ {
+ "epoch": 3.8157894736842106,
+ "grad_norm": 0.4888289272785187,
+ "learning_rate": 1.926853572750741e-06,
+ "loss": 0.1028,
+ "step": 435
+ },
+ {
+ "epoch": 3.824561403508772,
+ "grad_norm": 0.5810404419898987,
+ "learning_rate": 1.913771526073976e-06,
+ "loss": 0.1031,
+ "step": 436
+ },
+ {
+ "epoch": 3.8333333333333335,
+ "grad_norm": 0.5372979044914246,
+ "learning_rate": 1.9007064438522374e-06,
+ "loss": 0.107,
+ "step": 437
+ },
+ {
+ "epoch": 3.8421052631578947,
+ "grad_norm": 0.8293616771697998,
+ "learning_rate": 1.8876587041667855e-06,
+ "loss": 0.1033,
+ "step": 438
+ },
+ {
+ "epoch": 3.8508771929824563,
+ "grad_norm": 2.361504554748535,
+ "learning_rate": 1.8746286845970145e-06,
+ "loss": 0.1098,
+ "step": 439
+ },
+ {
+ "epoch": 3.8596491228070176,
+ "grad_norm": 0.70230633020401,
+ "learning_rate": 1.8616167622095328e-06,
+ "loss": 0.1034,
+ "step": 440
+ },
+ {
+ "epoch": 3.8684210526315788,
+ "grad_norm": 0.6323564052581787,
+ "learning_rate": 1.8486233135472436e-06,
+ "loss": 0.1058,
+ "step": 441
+ },
+ {
+ "epoch": 3.8771929824561404,
+ "grad_norm": 0.48205408453941345,
+ "learning_rate": 1.8356487146184517e-06,
+ "loss": 0.105,
+ "step": 442
+ },
+ {
+ "epoch": 3.8859649122807016,
+ "grad_norm": 0.6996872425079346,
+ "learning_rate": 1.8226933408859864e-06,
+ "loss": 0.1083,
+ "step": 443
+ },
+ {
+ "epoch": 3.8947368421052633,
+ "grad_norm": 0.4114651679992676,
+ "learning_rate": 1.8097575672563278e-06,
+ "loss": 0.1003,
+ "step": 444
+ },
+ {
+ "epoch": 3.9035087719298245,
+ "grad_norm": 0.5234648585319519,
+ "learning_rate": 1.7968417680687666e-06,
+ "loss": 0.1019,
+ "step": 445
+ },
+ {
+ "epoch": 3.912280701754386,
+ "grad_norm": 1.0571491718292236,
+ "learning_rate": 1.7839463170845641e-06,
+ "loss": 0.1003,
+ "step": 446
+ },
+ {
+ "epoch": 3.9210526315789473,
+ "grad_norm": 0.7470094561576843,
+ "learning_rate": 1.7710715874761408e-06,
+ "loss": 0.1061,
+ "step": 447
+ },
+ {
+ "epoch": 3.9298245614035086,
+ "grad_norm": 0.901695191860199,
+ "learning_rate": 1.7582179518162742e-06,
+ "loss": 0.1015,
+ "step": 448
+ },
+ {
+ "epoch": 3.93859649122807,
+ "grad_norm": 1.0251179933547974,
+ "learning_rate": 1.7453857820673215e-06,
+ "loss": 0.1,
+ "step": 449
+ },
+ {
+ "epoch": 3.9473684210526314,
+ "grad_norm": 0.5065406560897827,
+ "learning_rate": 1.7325754495704508e-06,
+ "loss": 0.1036,
+ "step": 450
+ },
+ {
+ "epoch": 3.956140350877193,
+ "grad_norm": 0.9541155099868774,
+ "learning_rate": 1.7197873250348962e-06,
+ "loss": 0.1015,
+ "step": 451
+ },
+ {
+ "epoch": 3.9649122807017543,
+ "grad_norm": 0.6264199018478394,
+ "learning_rate": 1.7070217785272354e-06,
+ "loss": 0.1026,
+ "step": 452
+ },
+ {
+ "epoch": 3.973684210526316,
+ "grad_norm": 0.6260526180267334,
+ "learning_rate": 1.6942791794606716e-06,
+ "loss": 0.1039,
+ "step": 453
+ },
+ {
+ "epoch": 3.982456140350877,
+ "grad_norm": 0.4730931222438812,
+ "learning_rate": 1.681559896584352e-06,
+ "loss": 0.1045,
+ "step": 454
+ },
+ {
+ "epoch": 3.9912280701754383,
+ "grad_norm": 0.5011451840400696,
+ "learning_rate": 1.668864297972689e-06,
+ "loss": 0.1062,
+ "step": 455
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 1.0113046169281006,
+ "learning_rate": 1.6561927510147172e-06,
+ "loss": 0.1005,
+ "step": 456
+ },
+ {
+ "epoch": 4.008771929824562,
+ "grad_norm": 0.6017364263534546,
+ "learning_rate": 1.6435456224034536e-06,
+ "loss": 0.1042,
+ "step": 457
+ },
+ {
+ "epoch": 4.017543859649122,
+ "grad_norm": 0.6874931454658508,
+ "learning_rate": 1.63092327812529e-06,
+ "loss": 0.102,
+ "step": 458
+ },
+ {
+ "epoch": 4.026315789473684,
+ "grad_norm": 1.311024785041809,
+ "learning_rate": 1.6183260834494053e-06,
+ "loss": 0.1063,
+ "step": 459
+ },
+ {
+ "epoch": 4.035087719298246,
+ "grad_norm": 0.3640352785587311,
+ "learning_rate": 1.6057544029171863e-06,
+ "loss": 0.1039,
+ "step": 460
+ },
+ {
+ "epoch": 4.043859649122807,
+ "grad_norm": 0.6056526303291321,
+ "learning_rate": 1.5932086003316893e-06,
+ "loss": 0.099,
+ "step": 461
+ },
+ {
+ "epoch": 4.052631578947368,
+ "grad_norm": 0.5407683849334717,
+ "learning_rate": 1.5806890387471025e-06,
+ "loss": 0.1038,
+ "step": 462
+ },
+ {
+ "epoch": 4.06140350877193,
+ "grad_norm": 0.7054030895233154,
+ "learning_rate": 1.5681960804582474e-06,
+ "loss": 0.1001,
+ "step": 463
+ },
+ {
+ "epoch": 4.0701754385964914,
+ "grad_norm": 0.8736140727996826,
+ "learning_rate": 1.5557300869900876e-06,
+ "loss": 0.1035,
+ "step": 464
+ },
+ {
+ "epoch": 4.078947368421052,
+ "grad_norm": 0.6689419746398926,
+ "learning_rate": 1.5432914190872757e-06,
+ "loss": 0.1052,
+ "step": 465
+ },
+ {
+ "epoch": 4.087719298245614,
+ "grad_norm": 0.8937819600105286,
+ "learning_rate": 1.530880436703705e-06,
+ "loss": 0.1024,
+ "step": 466
+ },
+ {
+ "epoch": 4.0964912280701755,
+ "grad_norm": 0.24332484602928162,
+ "learning_rate": 1.518497498992097e-06,
+ "loss": 0.0984,
+ "step": 467
+ },
+ {
+ "epoch": 4.105263157894737,
+ "grad_norm": 0.9716914296150208,
+ "learning_rate": 1.5061429642936107e-06,
+ "loss": 0.1012,
+ "step": 468
+ },
+ {
+ "epoch": 4.114035087719298,
+ "grad_norm": 0.5864392518997192,
+ "learning_rate": 1.4938171901274678e-06,
+ "loss": 0.1029,
+ "step": 469
+ },
+ {
+ "epoch": 4.12280701754386,
+ "grad_norm": 0.4616212546825409,
+ "learning_rate": 1.4815205331806113e-06,
+ "loss": 0.1035,
+ "step": 470
+ },
+ {
+ "epoch": 4.131578947368421,
+ "grad_norm": 0.5989730954170227,
+ "learning_rate": 1.4692533492973775e-06,
+ "loss": 0.1036,
+ "step": 471
+ },
+ {
+ "epoch": 4.140350877192983,
+ "grad_norm": 0.7900629639625549,
+ "learning_rate": 1.4570159934692085e-06,
+ "loss": 0.1044,
+ "step": 472
+ },
+ {
+ "epoch": 4.149122807017544,
+ "grad_norm": 0.5659995675086975,
+ "learning_rate": 1.4448088198243668e-06,
+ "loss": 0.1024,
+ "step": 473
+ },
+ {
+ "epoch": 4.157894736842105,
+ "grad_norm": 0.7867873311042786,
+ "learning_rate": 1.432632181617698e-06,
+ "loss": 0.1038,
+ "step": 474
+ },
+ {
+ "epoch": 4.166666666666667,
+ "grad_norm": 0.44385358691215515,
+ "learning_rate": 1.4204864312204033e-06,
+ "loss": 0.1006,
+ "step": 475
+ },
+ {
+ "epoch": 4.175438596491228,
+ "grad_norm": 0.3909265697002411,
+ "learning_rate": 1.4083719201098404e-06,
+ "loss": 0.1019,
+ "step": 476
+ },
+ {
+ "epoch": 4.184210526315789,
+ "grad_norm": 0.7079223990440369,
+ "learning_rate": 1.3962889988593609e-06,
+ "loss": 0.1019,
+ "step": 477
+ },
+ {
+ "epoch": 4.192982456140351,
+ "grad_norm": 0.6703695058822632,
+ "learning_rate": 1.3842380171281522e-06,
+ "loss": 0.1063,
+ "step": 478
+ },
+ {
+ "epoch": 4.201754385964913,
+ "grad_norm": 0.3477051556110382,
+ "learning_rate": 1.3722193236511344e-06,
+ "loss": 0.1004,
+ "step": 479
+ },
+ {
+ "epoch": 4.2105263157894735,
+ "grad_norm": 0.7296048402786255,
+ "learning_rate": 1.3602332662288536e-06,
+ "loss": 0.1057,
+ "step": 480
+ },
+ {
+ "epoch": 4.219298245614035,
+ "grad_norm": 0.7007803916931152,
+ "learning_rate": 1.348280191717427e-06,
+ "loss": 0.1007,
+ "step": 481
+ },
+ {
+ "epoch": 4.228070175438597,
+ "grad_norm": 0.948968231678009,
+ "learning_rate": 1.3363604460185031e-06,
+ "loss": 0.1005,
+ "step": 482
+ },
+ {
+ "epoch": 4.2368421052631575,
+ "grad_norm": 0.6567812561988831,
+ "learning_rate": 1.3244743740692496e-06,
+ "loss": 0.1016,
+ "step": 483
+ },
+ {
+ "epoch": 4.245614035087719,
+ "grad_norm": 0.5390146374702454,
+ "learning_rate": 1.3126223198323752e-06,
+ "loss": 0.1025,
+ "step": 484
+ },
+ {
+ "epoch": 4.254385964912281,
+ "grad_norm": 0.43638724088668823,
+ "learning_rate": 1.3008046262861696e-06,
+ "loss": 0.1053,
+ "step": 485
+ },
+ {
+ "epoch": 4.2631578947368425,
+ "grad_norm": 0.43589839339256287,
+ "learning_rate": 1.289021635414589e-06,
+ "loss": 0.1036,
+ "step": 486
+ },
+ {
+ "epoch": 4.271929824561403,
+ "grad_norm": 0.3999694585800171,
+ "learning_rate": 1.277273688197346e-06,
+ "loss": 0.1023,
+ "step": 487
+ },
+ {
+ "epoch": 4.280701754385965,
+ "grad_norm": 0.6314297914505005,
+ "learning_rate": 1.265561124600057e-06,
+ "loss": 0.0993,
+ "step": 488
+ },
+ {
+ "epoch": 4.2894736842105265,
+ "grad_norm": 0.566033124923706,
+ "learning_rate": 1.2538842835643906e-06,
+ "loss": 0.1029,
+ "step": 489
+ },
+ {
+ "epoch": 4.298245614035087,
+ "grad_norm": 0.6713336110115051,
+ "learning_rate": 1.2422435029982669e-06,
+ "loss": 0.1002,
+ "step": 490
+ },
+ {
+ "epoch": 4.307017543859649,
+ "grad_norm": 0.428574800491333,
+ "learning_rate": 1.2306391197660797e-06,
+ "loss": 0.1028,
+ "step": 491
+ },
+ {
+ "epoch": 4.315789473684211,
+ "grad_norm": 0.637745201587677,
+ "learning_rate": 1.219071469678941e-06,
+ "loss": 0.1009,
+ "step": 492
+ },
+ {
+ "epoch": 4.324561403508772,
+ "grad_norm": 0.8204445242881775,
+ "learning_rate": 1.2075408874849747e-06,
+ "loss": 0.099,
+ "step": 493
+ },
+ {
+ "epoch": 4.333333333333333,
+ "grad_norm": 1.010758876800537,
+ "learning_rate": 1.1960477068596155e-06,
+ "loss": 0.1006,
+ "step": 494
+ },
+ {
+ "epoch": 4.342105263157895,
+ "grad_norm": 0.908112108707428,
+ "learning_rate": 1.1845922603959677e-06,
+ "loss": 0.1047,
+ "step": 495
+ },
+ {
+ "epoch": 4.350877192982456,
+ "grad_norm": 1.0254642963409424,
+ "learning_rate": 1.173174879595166e-06,
+ "loss": 0.0991,
+ "step": 496
+ },
+ {
+ "epoch": 4.359649122807017,
+ "grad_norm": 0.5159414410591125,
+ "learning_rate": 1.1617958948567967e-06,
+ "loss": 0.0978,
+ "step": 497
+ },
+ {
+ "epoch": 4.368421052631579,
+ "grad_norm": 0.9525816440582275,
+ "learning_rate": 1.1504556354693227e-06,
+ "loss": 0.1051,
+ "step": 498
+ },
+ {
+ "epoch": 4.37719298245614,
+ "grad_norm": 0.9321548938751221,
+ "learning_rate": 1.1391544296005652e-06,
+ "loss": 0.1011,
+ "step": 499
+ },
+ {
+ "epoch": 4.385964912280702,
+ "grad_norm": 0.7308889627456665,
+ "learning_rate": 1.1278926042882026e-06,
+ "loss": 0.1002,
+ "step": 500
+ },
+ {
+ "epoch": 4.394736842105263,
+ "grad_norm": 0.9508903622627258,
+ "learning_rate": 1.116670485430304e-06,
+ "loss": 0.1013,
+ "step": 501
+ },
+ {
+ "epoch": 4.4035087719298245,
+ "grad_norm": 0.5174031853675842,
+ "learning_rate": 1.1054883977759067e-06,
+ "loss": 0.104,
+ "step": 502
+ },
+ {
+ "epoch": 4.412280701754386,
+ "grad_norm": 0.4504610598087311,
+ "learning_rate": 1.0943466649156061e-06,
+ "loss": 0.1013,
+ "step": 503
+ },
+ {
+ "epoch": 4.421052631578947,
+ "grad_norm": 0.5650261044502258,
+ "learning_rate": 1.0832456092722063e-06,
+ "loss": 0.0995,
+ "step": 504
+ },
+ {
+ "epoch": 4.4298245614035086,
+ "grad_norm": 0.37759432196617126,
+ "learning_rate": 1.0721855520913751e-06,
+ "loss": 0.1058,
+ "step": 505
+ },
+ {
+ "epoch": 4.43859649122807,
+ "grad_norm": 0.7238495349884033,
+ "learning_rate": 1.0611668134323577e-06,
+ "loss": 0.1012,
+ "step": 506
+ },
+ {
+ "epoch": 4.447368421052632,
+ "grad_norm": 0.6301494240760803,
+ "learning_rate": 1.0501897121587127e-06,
+ "loss": 0.1009,
+ "step": 507
+ },
+ {
+ "epoch": 4.456140350877193,
+ "grad_norm": 0.9531002044677734,
+ "learning_rate": 1.0392545659290789e-06,
+ "loss": 0.1021,
+ "step": 508
+ },
+ {
+ "epoch": 4.464912280701754,
+ "grad_norm": 0.4423767924308777,
+ "learning_rate": 1.0283616911879943e-06,
+ "loss": 0.1024,
+ "step": 509
+ },
+ {
+ "epoch": 4.473684210526316,
+ "grad_norm": 0.5573019981384277,
+ "learning_rate": 1.0175114031567246e-06,
+ "loss": 0.1011,
+ "step": 510
+ },
+ {
+ "epoch": 4.482456140350878,
+ "grad_norm": 0.9792631268501282,
+ "learning_rate": 1.0067040158241555e-06,
+ "loss": 0.1039,
+ "step": 511
+ },
+ {
+ "epoch": 4.491228070175438,
+ "grad_norm": 1.7911303043365479,
+ "learning_rate": 9.95939841937693e-07,
+ "loss": 0.104,
+ "step": 512
+ },
+ {
+ "epoch": 4.5,
+ "grad_norm": 0.5825617909431458,
+ "learning_rate": 9.852191929942262e-07,
+ "loss": 0.0987,
+ "step": 513
+ },
+ {
+ "epoch": 4.508771929824562,
+ "grad_norm": 0.3129921555519104,
+ "learning_rate": 9.745423792310996e-07,
+ "loss": 0.0979,
+ "step": 514
+ },
+ {
+ "epoch": 4.517543859649123,
+ "grad_norm": 0.5376678705215454,
+ "learning_rate": 9.63909709617146e-07,
+ "loss": 0.0998,
+ "step": 515
+ },
+ {
+ "epoch": 4.526315789473684,
+ "grad_norm": 0.48920008540153503,
+ "learning_rate": 9.533214918437422e-07,
+ "loss": 0.1017,
+ "step": 516
+ },
+ {
+ "epoch": 4.535087719298246,
+ "grad_norm": 0.36829131841659546,
+ "learning_rate": 9.427780323159006e-07,
+ "loss": 0.1004,
+ "step": 517
+ },
+ {
+ "epoch": 4.543859649122807,
+ "grad_norm": 0.5459544658660889,
+ "learning_rate": 9.322796361434111e-07,
+ "loss": 0.1041,
+ "step": 518
+ },
+ {
+ "epoch": 4.552631578947368,
+ "grad_norm": 0.8460657000541687,
+ "learning_rate": 9.218266071320015e-07,
+ "loss": 0.1012,
+ "step": 519
+ },
+ {
+ "epoch": 4.56140350877193,
+ "grad_norm": 0.7692683339118958,
+ "learning_rate": 9.114192477745568e-07,
+ "loss": 0.1013,
+ "step": 520
+ },
+ {
+ "epoch": 4.5701754385964914,
+ "grad_norm": 0.4503592550754547,
+ "learning_rate": 9.010578592423544e-07,
+ "loss": 0.107,
+ "step": 521
+ },
+ {
+ "epoch": 4.578947368421053,
+ "grad_norm": 0.9348855018615723,
+ "learning_rate": 8.907427413763572e-07,
+ "loss": 0.102,
+ "step": 522
+ },
+ {
+ "epoch": 4.587719298245614,
+ "grad_norm": 0.7902988791465759,
+ "learning_rate": 8.804741926785335e-07,
+ "loss": 0.1032,
+ "step": 523
+ },
+ {
+ "epoch": 4.5964912280701755,
+ "grad_norm": 0.5444673299789429,
+ "learning_rate": 8.702525103032186e-07,
+ "loss": 0.0993,
+ "step": 524
+ },
+ {
+ "epoch": 4.605263157894737,
+ "grad_norm": 0.728112518787384,
+ "learning_rate": 8.60077990048517e-07,
+ "loss": 0.1021,
+ "step": 525
+ },
+ {
+ "epoch": 4.614035087719298,
+ "grad_norm": 0.5250695943832397,
+ "learning_rate": 8.499509263477388e-07,
+ "loss": 0.1018,
+ "step": 526
+ },
+ {
+ "epoch": 4.62280701754386,
+ "grad_norm": 0.3112829625606537,
+ "learning_rate": 8.398716122608868e-07,
+ "loss": 0.1037,
+ "step": 527
+ },
+ {
+ "epoch": 4.631578947368421,
+ "grad_norm": 0.9097342491149902,
+ "learning_rate": 8.298403394661658e-07,
+ "loss": 0.1015,
+ "step": 528
+ },
+ {
+ "epoch": 4.640350877192983,
+ "grad_norm": 0.6663810014724731,
+ "learning_rate": 8.198573982515537e-07,
+ "loss": 0.1038,
+ "step": 529
+ },
+ {
+ "epoch": 4.649122807017544,
+ "grad_norm": 1.1880309581756592,
+ "learning_rate": 8.099230775063879e-07,
+ "loss": 0.1044,
+ "step": 530
+ },
+ {
+ "epoch": 4.657894736842105,
+ "grad_norm": 0.6492993831634521,
+ "learning_rate": 8.000376647130165e-07,
+ "loss": 0.103,
+ "step": 531
+ },
+ {
+ "epoch": 4.666666666666667,
+ "grad_norm": 0.43723204731941223,
+ "learning_rate": 7.902014459384744e-07,
+ "loss": 0.1025,
+ "step": 532
+ },
+ {
+ "epoch": 4.675438596491228,
+ "grad_norm": 0.8422684669494629,
+ "learning_rate": 7.804147058262015e-07,
+ "loss": 0.1035,
+ "step": 533
+ },
+ {
+ "epoch": 4.684210526315789,
+ "grad_norm": 0.6502094268798828,
+ "learning_rate": 7.706777275878161e-07,
+ "loss": 0.0994,
+ "step": 534
+ },
+ {
+ "epoch": 4.692982456140351,
+ "grad_norm": 0.5709391236305237,
+ "learning_rate": 7.609907929949045e-07,
+ "loss": 0.1056,
+ "step": 535
+ },
+ {
+ "epoch": 4.701754385964913,
+ "grad_norm": 0.4126770496368408,
+ "learning_rate": 7.513541823708828e-07,
+ "loss": 0.101,
+ "step": 536
+ },
+ {
+ "epoch": 4.7105263157894735,
+ "grad_norm": 0.5016621947288513,
+ "learning_rate": 7.417681745828706e-07,
+ "loss": 0.0999,
+ "step": 537
+ },
+ {
+ "epoch": 4.719298245614035,
+ "grad_norm": 0.8139487504959106,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 0.0984,
+ "step": 538
+ },
+ {
+ "epoch": 4.728070175438597,
+ "grad_norm": 0.5805723667144775,
+ "learning_rate": 7.227490756535396e-07,
+ "loss": 0.1011,
+ "step": 539
+ },
+ {
+ "epoch": 4.7368421052631575,
+ "grad_norm": 0.7970795631408691,
+ "learning_rate": 7.133165348925978e-07,
+ "loss": 0.1016,
+ "step": 540
+ },
+ {
+ "epoch": 4.745614035087719,
+ "grad_norm": 0.6336880326271057,
+ "learning_rate": 7.039356977124937e-07,
+ "loss": 0.1027,
+ "step": 541
+ },
+ {
+ "epoch": 4.754385964912281,
+ "grad_norm": 0.2953254282474518,
+ "learning_rate": 6.946068355786992e-07,
+ "loss": 0.1022,
+ "step": 542
+ },
+ {
+ "epoch": 4.7631578947368425,
+ "grad_norm": 0.5646472573280334,
+ "learning_rate": 6.853302184526217e-07,
+ "loss": 0.0998,
+ "step": 543
+ },
+ {
+ "epoch": 4.771929824561403,
+ "grad_norm": 0.6545483469963074,
+ "learning_rate": 6.761061147837808e-07,
+ "loss": 0.0985,
+ "step": 544
+ },
+ {
+ "epoch": 4.780701754385965,
+ "grad_norm": 0.8741705417633057,
+ "learning_rate": 6.669347915020524e-07,
+ "loss": 0.1006,
+ "step": 545
+ },
+ {
+ "epoch": 4.7894736842105265,
+ "grad_norm": 0.8579487204551697,
+ "learning_rate": 6.578165140099318e-07,
+ "loss": 0.1037,
+ "step": 546
+ },
+ {
+ "epoch": 4.798245614035087,
+ "grad_norm": 1.0744833946228027,
+ "learning_rate": 6.487515461748631e-07,
+ "loss": 0.1017,
+ "step": 547
+ },
+ {
+ "epoch": 4.807017543859649,
+ "grad_norm": 0.4954414367675781,
+ "learning_rate": 6.397401503215992e-07,
+ "loss": 0.1006,
+ "step": 548
+ },
+ {
+ "epoch": 4.815789473684211,
+ "grad_norm": 0.525191068649292,
+ "learning_rate": 6.307825872246076e-07,
+ "loss": 0.1024,
+ "step": 549
+ },
+ {
+ "epoch": 4.824561403508772,
+ "grad_norm": 0.8922368288040161,
+ "learning_rate": 6.218791161005336e-07,
+ "loss": 0.0999,
+ "step": 550
+ },
+ {
+ "epoch": 4.833333333333333,
+ "grad_norm": 0.6471604704856873,
+ "learning_rate": 6.13029994600686e-07,
+ "loss": 0.0994,
+ "step": 551
+ },
+ {
+ "epoch": 4.842105263157895,
+ "grad_norm": 0.49826696515083313,
+ "learning_rate": 6.042354788035943e-07,
+ "loss": 0.1003,
+ "step": 552
+ },
+ {
+ "epoch": 4.850877192982456,
+ "grad_norm": 0.7908043265342712,
+ "learning_rate": 5.954958232075858e-07,
+ "loss": 0.1003,
+ "step": 553
+ },
+ {
+ "epoch": 4.859649122807017,
+ "grad_norm": 0.40011560916900635,
+ "learning_rate": 5.868112807234313e-07,
+ "loss": 0.0991,
+ "step": 554
+ },
+ {
+ "epoch": 4.868421052631579,
+ "grad_norm": 0.9797350764274597,
+ "learning_rate": 5.781821026670203e-07,
+ "loss": 0.1005,
+ "step": 555
+ },
+ {
+ "epoch": 4.87719298245614,
+ "grad_norm": 0.4581677317619324,
+ "learning_rate": 5.696085387520894e-07,
+ "loss": 0.1013,
+ "step": 556
+ },
+ {
+ "epoch": 4.885964912280702,
+ "grad_norm": 0.6596454381942749,
+ "learning_rate": 5.610908370829981e-07,
+ "loss": 0.1028,
+ "step": 557
+ },
+ {
+ "epoch": 4.894736842105263,
+ "grad_norm": 0.5106292963027954,
+ "learning_rate": 5.526292441475448e-07,
+ "loss": 0.1023,
+ "step": 558
+ },
+ {
+ "epoch": 4.9035087719298245,
+ "grad_norm": 0.5137461423873901,
+ "learning_rate": 5.442240048098402e-07,
+ "loss": 0.1036,
+ "step": 559
+ },
+ {
+ "epoch": 4.912280701754386,
+ "grad_norm": 0.4619182348251343,
+ "learning_rate": 5.358753623032137e-07,
+ "loss": 0.0979,
+ "step": 560
+ },
+ {
+ "epoch": 4.921052631578947,
+ "grad_norm": 0.5350770354270935,
+ "learning_rate": 5.275835582231833e-07,
+ "loss": 0.0992,
+ "step": 561
+ },
+ {
+ "epoch": 4.9298245614035086,
+ "grad_norm": 0.7599822878837585,
+ "learning_rate": 5.193488325204551e-07,
+ "loss": 0.0983,
+ "step": 562
+ },
+ {
+ "epoch": 4.93859649122807,
+ "grad_norm": 0.47537004947662354,
+ "learning_rate": 5.111714234939868e-07,
+ "loss": 0.1004,
+ "step": 563
+ },
+ {
+ "epoch": 4.947368421052632,
+ "grad_norm": 0.597273588180542,
+ "learning_rate": 5.030515677840883e-07,
+ "loss": 0.1015,
+ "step": 564
+ },
+ {
+ "epoch": 4.956140350877193,
+ "grad_norm": 0.7155528664588928,
+ "learning_rate": 4.949895003655728e-07,
+ "loss": 0.1017,
+ "step": 565
+ },
+ {
+ "epoch": 4.964912280701754,
+ "grad_norm": 0.530358612537384,
+ "learning_rate": 4.869854545409627e-07,
+ "loss": 0.0998,
+ "step": 566
+ },
+ {
+ "epoch": 4.973684210526316,
+ "grad_norm": 0.6721721291542053,
+ "learning_rate": 4.790396619337286e-07,
+ "loss": 0.1003,
+ "step": 567
+ },
+ {
+ "epoch": 4.982456140350877,
+ "grad_norm": 0.8486731648445129,
+ "learning_rate": 4.711523524815978e-07,
+ "loss": 0.0996,
+ "step": 568
+ },
+ {
+ "epoch": 4.991228070175438,
+ "grad_norm": 0.7072808742523193,
+ "learning_rate": 4.633237544298891e-07,
+ "loss": 0.1004,
+ "step": 569
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.41283953189849854,
+ "learning_rate": 4.555540943249187e-07,
+ "loss": 0.1026,
+ "step": 570
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 684,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 114,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.72999503707426e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-570/training_args.bin b/checkpoint-570/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf
--- /dev/null
+++ b/checkpoint-570/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae
+size 7992
diff --git a/checkpoint-570/zero_to_fp32.py b/checkpoint-570/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-570/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-684/README.md b/checkpoint-684/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615
--- /dev/null
+++ b/checkpoint-684/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-684/adapter_config.json b/checkpoint-684/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be
--- /dev/null
+++ b/checkpoint-684/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "q_proj",
+ "v_proj",
+ "down_proj",
+ "gate_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-684/adapter_model.safetensors b/checkpoint-684/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d071e8337a127c8780a346e6e69c4e2195786154
--- /dev/null
+++ b/checkpoint-684/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c08cabaa331365104eda0f955b3bcca40f58f5ba2408e03aedf9cc235c104191
+size 9016826528
diff --git a/checkpoint-684/global_step684/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-684/global_step684/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..149af1d5e1015eab2003622da67a889ac84b9518
--- /dev/null
+++ b/checkpoint-684/global_step684/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6e27fa02802a0dd6d25d10cf49d0c1a101925347cf6520883bf0b4d10b9d864
+size 27050164444
diff --git a/checkpoint-684/global_step684/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-684/global_step684/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..11e077895710f8291da3f390a7bbeac47244c64f
--- /dev/null
+++ b/checkpoint-684/global_step684/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:912be014e27defd74da10296591495afa96b599a5d2c146be81081413e6a81e4
+size 27050169884
diff --git a/checkpoint-684/global_step684/mp_rank_00_model_states.pt b/checkpoint-684/global_step684/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e688a60d4a45499a9a698c47b81b7b14df4c192f
--- /dev/null
+++ b/checkpoint-684/global_step684/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f5150b6fe3f1d5ba4817fa4a2a2b4f38d7d0cc97fc70f92b69861b6e3b7371
+size 9776788601
diff --git a/checkpoint-684/latest b/checkpoint-684/latest
new file mode 100644
index 0000000000000000000000000000000000000000..32b7f894d10e5e12f7ef9cea66d082aaff9baad6
--- /dev/null
+++ b/checkpoint-684/latest
@@ -0,0 +1 @@
+global_step684
\ No newline at end of file
diff --git a/checkpoint-684/rng_state_0.pth b/checkpoint-684/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..e5def56c514e9207d48ed27325175e02388447eb
--- /dev/null
+++ b/checkpoint-684/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25cff4a257babccfd8e674add2d01ad4892c537ed897a74d1a9134b1885b4f7f
+size 14512
diff --git a/checkpoint-684/rng_state_1.pth b/checkpoint-684/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1d6231fc6e68165bc83edb42b0dd0d3bea65d
--- /dev/null
+++ b/checkpoint-684/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c864b15610207b2ff2d0d3d92423e5c186888dbcd07fc522ebe0404df39b8118
+size 14512
diff --git a/checkpoint-684/scheduler.pt b/checkpoint-684/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..40438486cd88f508a260f58d9ab24bfa9cf84217
--- /dev/null
+++ b/checkpoint-684/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:413a7e9882fa261750972ae9e540d9a20775ad3cb6dc44fdda8e90c61665a5d3
+size 1064
diff --git a/checkpoint-684/special_tokens_map.json b/checkpoint-684/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-684/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-684/tokenizer.json b/checkpoint-684/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-684/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-684/tokenizer_config.json b/checkpoint-684/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/checkpoint-684/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}{%- set content = message['content'].split('</think>')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-684/trainer_state.json b/checkpoint-684/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3dba80755a03b104345d40314f07c9a10d1bbb79
--- /dev/null
+++ b/checkpoint-684/trainer_state.json
@@ -0,0 +1,4821 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.0,
+ "eval_steps": 500,
+ "global_step": 684,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.008771929824561403,
+ "grad_norm": 39.56407165527344,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 5.1375,
+ "step": 1
+ },
+ {
+ "epoch": 0.017543859649122806,
+ "grad_norm": 40.30452346801758,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 5.1185,
+ "step": 2
+ },
+ {
+ "epoch": 0.02631578947368421,
+ "grad_norm": 40.062313079833984,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 5.0762,
+ "step": 3
+ },
+ {
+ "epoch": 0.03508771929824561,
+ "grad_norm": 39.17148208618164,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 5.016,
+ "step": 4
+ },
+ {
+ "epoch": 0.043859649122807015,
+ "grad_norm": 40.67367172241211,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 5.0428,
+ "step": 5
+ },
+ {
+ "epoch": 0.05263157894736842,
+ "grad_norm": 38.18095016479492,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 5.2025,
+ "step": 6
+ },
+ {
+ "epoch": 0.06140350877192982,
+ "grad_norm": 39.12940979003906,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 4.9896,
+ "step": 7
+ },
+ {
+ "epoch": 0.07017543859649122,
+ "grad_norm": 38.84568405151367,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 5.1078,
+ "step": 8
+ },
+ {
+ "epoch": 0.07894736842105263,
+ "grad_norm": 39.38333511352539,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 5.0808,
+ "step": 9
+ },
+ {
+ "epoch": 0.08771929824561403,
+ "grad_norm": 39.427650451660156,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 5.0534,
+ "step": 10
+ },
+ {
+ "epoch": 0.09649122807017543,
+ "grad_norm": 39.29513168334961,
+ "learning_rate": 5.5e-07,
+ "loss": 5.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.10526315789473684,
+ "grad_norm": 39.641231536865234,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 5.0317,
+ "step": 12
+ },
+ {
+ "epoch": 0.11403508771929824,
+ "grad_norm": 37.91259765625,
+ "learning_rate": 6.5e-07,
+ "loss": 4.912,
+ "step": 13
+ },
+ {
+ "epoch": 0.12280701754385964,
+ "grad_norm": 38.203548431396484,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 4.9705,
+ "step": 14
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 39.15998840332031,
+ "learning_rate": 7.5e-07,
+ "loss": 4.6962,
+ "step": 15
+ },
+ {
+ "epoch": 0.14035087719298245,
+ "grad_norm": 37.754669189453125,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 4.6262,
+ "step": 16
+ },
+ {
+ "epoch": 0.14912280701754385,
+ "grad_norm": 35.871490478515625,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 4.5422,
+ "step": 17
+ },
+ {
+ "epoch": 0.15789473684210525,
+ "grad_norm": 36.16888427734375,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 4.664,
+ "step": 18
+ },
+ {
+ "epoch": 0.16666666666666666,
+ "grad_norm": 33.520118713378906,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 4.4697,
+ "step": 19
+ },
+ {
+ "epoch": 0.17543859649122806,
+ "grad_norm": 30.896282196044922,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 4.3568,
+ "step": 20
+ },
+ {
+ "epoch": 0.18421052631578946,
+ "grad_norm": 29.944643020629883,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 4.2269,
+ "step": 21
+ },
+ {
+ "epoch": 0.19298245614035087,
+ "grad_norm": 25.224485397338867,
+ "learning_rate": 1.1e-06,
+ "loss": 4.1272,
+ "step": 22
+ },
+ {
+ "epoch": 0.20175438596491227,
+ "grad_norm": 24.410480499267578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 4.0585,
+ "step": 23
+ },
+ {
+ "epoch": 0.21052631578947367,
+ "grad_norm": 21.480648040771484,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 3.9472,
+ "step": 24
+ },
+ {
+ "epoch": 0.21929824561403508,
+ "grad_norm": 20.61946678161621,
+ "learning_rate": 1.25e-06,
+ "loss": 3.8879,
+ "step": 25
+ },
+ {
+ "epoch": 0.22807017543859648,
+ "grad_norm": 19.578271865844727,
+ "learning_rate": 1.3e-06,
+ "loss": 3.6783,
+ "step": 26
+ },
+ {
+ "epoch": 0.23684210526315788,
+ "grad_norm": 17.418983459472656,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 3.6826,
+ "step": 27
+ },
+ {
+ "epoch": 0.24561403508771928,
+ "grad_norm": 18.160301208496094,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 3.478,
+ "step": 28
+ },
+ {
+ "epoch": 0.2543859649122807,
+ "grad_norm": 17.573204040527344,
+ "learning_rate": 1.45e-06,
+ "loss": 3.459,
+ "step": 29
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 17.1265869140625,
+ "learning_rate": 1.5e-06,
+ "loss": 3.3999,
+ "step": 30
+ },
+ {
+ "epoch": 0.2719298245614035,
+ "grad_norm": 15.527145385742188,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 3.2817,
+ "step": 31
+ },
+ {
+ "epoch": 0.2807017543859649,
+ "grad_norm": 14.773847579956055,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 3.234,
+ "step": 32
+ },
+ {
+ "epoch": 0.2894736842105263,
+ "grad_norm": 12.039301872253418,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 3.132,
+ "step": 33
+ },
+ {
+ "epoch": 0.2982456140350877,
+ "grad_norm": 9.217979431152344,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 3.0548,
+ "step": 34
+ },
+ {
+ "epoch": 0.30701754385964913,
+ "grad_norm": 7.575639724731445,
+ "learning_rate": 1.75e-06,
+ "loss": 2.9529,
+ "step": 35
+ },
+ {
+ "epoch": 0.3157894736842105,
+ "grad_norm": 7.496004104614258,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 2.8967,
+ "step": 36
+ },
+ {
+ "epoch": 0.32456140350877194,
+ "grad_norm": 7.45414924621582,
+ "learning_rate": 1.85e-06,
+ "loss": 2.8837,
+ "step": 37
+ },
+ {
+ "epoch": 0.3333333333333333,
+ "grad_norm": 8.555658340454102,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 2.7473,
+ "step": 38
+ },
+ {
+ "epoch": 0.34210526315789475,
+ "grad_norm": 10.03805160522461,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 2.7355,
+ "step": 39
+ },
+ {
+ "epoch": 0.3508771929824561,
+ "grad_norm": 9.30649471282959,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 2.6587,
+ "step": 40
+ },
+ {
+ "epoch": 0.35964912280701755,
+ "grad_norm": 8.510339736938477,
+ "learning_rate": 2.05e-06,
+ "loss": 2.5977,
+ "step": 41
+ },
+ {
+ "epoch": 0.3684210526315789,
+ "grad_norm": 4.709080696105957,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 2.6286,
+ "step": 42
+ },
+ {
+ "epoch": 0.37719298245614036,
+ "grad_norm": 5.128961086273193,
+ "learning_rate": 2.15e-06,
+ "loss": 2.4558,
+ "step": 43
+ },
+ {
+ "epoch": 0.38596491228070173,
+ "grad_norm": 5.190136432647705,
+ "learning_rate": 2.2e-06,
+ "loss": 2.4432,
+ "step": 44
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 4.893551349639893,
+ "learning_rate": 2.25e-06,
+ "loss": 2.4939,
+ "step": 45
+ },
+ {
+ "epoch": 0.40350877192982454,
+ "grad_norm": 5.2434983253479,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 2.3381,
+ "step": 46
+ },
+ {
+ "epoch": 0.41228070175438597,
+ "grad_norm": 5.122412204742432,
+ "learning_rate": 2.35e-06,
+ "loss": 2.313,
+ "step": 47
+ },
+ {
+ "epoch": 0.42105263157894735,
+ "grad_norm": 4.577274799346924,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 2.2236,
+ "step": 48
+ },
+ {
+ "epoch": 0.4298245614035088,
+ "grad_norm": 4.722769737243652,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 2.1987,
+ "step": 49
+ },
+ {
+ "epoch": 0.43859649122807015,
+ "grad_norm": 5.059235095977783,
+ "learning_rate": 2.5e-06,
+ "loss": 2.1415,
+ "step": 50
+ },
+ {
+ "epoch": 0.4473684210526316,
+ "grad_norm": 4.454439640045166,
+ "learning_rate": 2.55e-06,
+ "loss": 2.0466,
+ "step": 51
+ },
+ {
+ "epoch": 0.45614035087719296,
+ "grad_norm": 4.94586706161499,
+ "learning_rate": 2.6e-06,
+ "loss": 1.8762,
+ "step": 52
+ },
+ {
+ "epoch": 0.4649122807017544,
+ "grad_norm": 4.704402446746826,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 1.8012,
+ "step": 53
+ },
+ {
+ "epoch": 0.47368421052631576,
+ "grad_norm": 6.125903129577637,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 1.7669,
+ "step": 54
+ },
+ {
+ "epoch": 0.4824561403508772,
+ "grad_norm": 4.5356059074401855,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 1.6607,
+ "step": 55
+ },
+ {
+ "epoch": 0.49122807017543857,
+ "grad_norm": 6.56803035736084,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 1.6291,
+ "step": 56
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 4.910050392150879,
+ "learning_rate": 2.85e-06,
+ "loss": 1.5545,
+ "step": 57
+ },
+ {
+ "epoch": 0.5087719298245614,
+ "grad_norm": 8.733433723449707,
+ "learning_rate": 2.9e-06,
+ "loss": 1.4206,
+ "step": 58
+ },
+ {
+ "epoch": 0.5175438596491229,
+ "grad_norm": 8.582486152648926,
+ "learning_rate": 2.95e-06,
+ "loss": 1.3912,
+ "step": 59
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 13.710689544677734,
+ "learning_rate": 3e-06,
+ "loss": 1.3297,
+ "step": 60
+ },
+ {
+ "epoch": 0.5350877192982456,
+ "grad_norm": 23.400312423706055,
+ "learning_rate": 3.05e-06,
+ "loss": 1.296,
+ "step": 61
+ },
+ {
+ "epoch": 0.543859649122807,
+ "grad_norm": 5.678805351257324,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 1.2259,
+ "step": 62
+ },
+ {
+ "epoch": 0.5526315789473685,
+ "grad_norm": 14.700899124145508,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 1.1087,
+ "step": 63
+ },
+ {
+ "epoch": 0.5614035087719298,
+ "grad_norm": 19.38919448852539,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 1.1805,
+ "step": 64
+ },
+ {
+ "epoch": 0.5701754385964912,
+ "grad_norm": 8.460039138793945,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 1.0963,
+ "step": 65
+ },
+ {
+ "epoch": 0.5789473684210527,
+ "grad_norm": 13.371014595031738,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 1.0627,
+ "step": 66
+ },
+ {
+ "epoch": 0.5877192982456141,
+ "grad_norm": 22.380569458007812,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 1.0869,
+ "step": 67
+ },
+ {
+ "epoch": 0.5964912280701754,
+ "grad_norm": 5.780513286590576,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.9991,
+ "step": 68
+ },
+ {
+ "epoch": 0.6052631578947368,
+ "grad_norm": 19.850841522216797,
+ "learning_rate": 3.45e-06,
+ "loss": 0.9683,
+ "step": 69
+ },
+ {
+ "epoch": 0.6140350877192983,
+ "grad_norm": 17.160703659057617,
+ "learning_rate": 3.5e-06,
+ "loss": 0.845,
+ "step": 70
+ },
+ {
+ "epoch": 0.6228070175438597,
+ "grad_norm": 14.264311790466309,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.8059,
+ "step": 71
+ },
+ {
+ "epoch": 0.631578947368421,
+ "grad_norm": 26.39459991455078,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.85,
+ "step": 72
+ },
+ {
+ "epoch": 0.6403508771929824,
+ "grad_norm": 51.10348892211914,
+ "learning_rate": 3.65e-06,
+ "loss": 0.9755,
+ "step": 73
+ },
+ {
+ "epoch": 0.6491228070175439,
+ "grad_norm": 28.795856475830078,
+ "learning_rate": 3.7e-06,
+ "loss": 0.8966,
+ "step": 74
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.6617937088012695,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.7716,
+ "step": 75
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.729666709899902,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.7578,
+ "step": 76
+ },
+ {
+ "epoch": 0.6754385964912281,
+ "grad_norm": 7.109970569610596,
+ "learning_rate": 3.85e-06,
+ "loss": 0.7055,
+ "step": 77
+ },
+ {
+ "epoch": 0.6842105263157895,
+ "grad_norm": 20.84659194946289,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.7458,
+ "step": 78
+ },
+ {
+ "epoch": 0.6929824561403509,
+ "grad_norm": 21.601303100585938,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6879,
+ "step": 79
+ },
+ {
+ "epoch": 0.7017543859649122,
+ "grad_norm": 3.6914751529693604,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6179,
+ "step": 80
+ },
+ {
+ "epoch": 0.7105263157894737,
+ "grad_norm": 16.539325714111328,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5716,
+ "step": 81
+ },
+ {
+ "epoch": 0.7192982456140351,
+ "grad_norm": 13.931925773620605,
+ "learning_rate": 4.1e-06,
+ "loss": 0.558,
+ "step": 82
+ },
+ {
+ "epoch": 0.7280701754385965,
+ "grad_norm": 10.52951717376709,
+ "learning_rate": 4.15e-06,
+ "loss": 0.6018,
+ "step": 83
+ },
+ {
+ "epoch": 0.7368421052631579,
+ "grad_norm": 17.337060928344727,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5501,
+ "step": 84
+ },
+ {
+ "epoch": 0.7456140350877193,
+ "grad_norm": 13.500468254089355,
+ "learning_rate": 4.25e-06,
+ "loss": 0.5214,
+ "step": 85
+ },
+ {
+ "epoch": 0.7543859649122807,
+ "grad_norm": 10.290645599365234,
+ "learning_rate": 4.3e-06,
+ "loss": 0.4996,
+ "step": 86
+ },
+ {
+ "epoch": 0.7631578947368421,
+ "grad_norm": 9.757556915283203,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.498,
+ "step": 87
+ },
+ {
+ "epoch": 0.7719298245614035,
+ "grad_norm": 9.325140953063965,
+ "learning_rate": 4.4e-06,
+ "loss": 0.4721,
+ "step": 88
+ },
+ {
+ "epoch": 0.7807017543859649,
+ "grad_norm": 2.9322128295898438,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.4528,
+ "step": 89
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 10.484073638916016,
+ "learning_rate": 4.5e-06,
+ "loss": 0.445,
+ "step": 90
+ },
+ {
+ "epoch": 0.7982456140350878,
+ "grad_norm": 32.7827262878418,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5105,
+ "step": 91
+ },
+ {
+ "epoch": 0.8070175438596491,
+ "grad_norm": 2.8477306365966797,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.4117,
+ "step": 92
+ },
+ {
+ "epoch": 0.8157894736842105,
+ "grad_norm": 2.7680225372314453,
+ "learning_rate": 4.65e-06,
+ "loss": 0.3653,
+ "step": 93
+ },
+ {
+ "epoch": 0.8245614035087719,
+ "grad_norm": 2.6512742042541504,
+ "learning_rate": 4.7e-06,
+ "loss": 0.3878,
+ "step": 94
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 6.453914165496826,
+ "learning_rate": 4.75e-06,
+ "loss": 0.3611,
+ "step": 95
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.4594080448150635,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.3817,
+ "step": 96
+ },
+ {
+ "epoch": 0.8508771929824561,
+ "grad_norm": 3.6144917011260986,
+ "learning_rate": 4.85e-06,
+ "loss": 0.3618,
+ "step": 97
+ },
+ {
+ "epoch": 0.8596491228070176,
+ "grad_norm": 5.349407196044922,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.3218,
+ "step": 98
+ },
+ {
+ "epoch": 0.868421052631579,
+ "grad_norm": 13.671236991882324,
+ "learning_rate": 4.95e-06,
+ "loss": 0.3329,
+ "step": 99
+ },
+ {
+ "epoch": 0.8771929824561403,
+ "grad_norm": 5.84046745300293,
+ "learning_rate": 5e-06,
+ "loss": 0.2967,
+ "step": 100
+ },
+ {
+ "epoch": 0.8859649122807017,
+ "grad_norm": 14.005338668823242,
+ "learning_rate": 4.999963827125897e-06,
+ "loss": 0.303,
+ "step": 101
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 9.18114185333252,
+ "learning_rate": 4.999855309550366e-06,
+ "loss": 0.2762,
+ "step": 102
+ },
+ {
+ "epoch": 0.9035087719298246,
+ "grad_norm": 3.0800487995147705,
+ "learning_rate": 4.999674450413725e-06,
+ "loss": 0.2628,
+ "step": 103
+ },
+ {
+ "epoch": 0.9122807017543859,
+ "grad_norm": 82.03578186035156,
+ "learning_rate": 4.999421254949728e-06,
+ "loss": 0.4065,
+ "step": 104
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 77.66315460205078,
+ "learning_rate": 4.99909573048542e-06,
+ "loss": 0.4307,
+ "step": 105
+ },
+ {
+ "epoch": 0.9298245614035088,
+ "grad_norm": 18.28767967224121,
+ "learning_rate": 4.998697886440927e-06,
+ "loss": 0.2571,
+ "step": 106
+ },
+ {
+ "epoch": 0.9385964912280702,
+ "grad_norm": 5.960445880889893,
+ "learning_rate": 4.998227734329177e-06,
+ "loss": 0.2847,
+ "step": 107
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 5.437699794769287,
+ "learning_rate": 4.9976852877555755e-06,
+ "loss": 0.2728,
+ "step": 108
+ },
+ {
+ "epoch": 0.956140350877193,
+ "grad_norm": 3.379631280899048,
+ "learning_rate": 4.997070562417602e-06,
+ "loss": 0.2467,
+ "step": 109
+ },
+ {
+ "epoch": 0.9649122807017544,
+ "grad_norm": 3.1625075340270996,
+ "learning_rate": 4.996383576104362e-06,
+ "loss": 0.2273,
+ "step": 110
+ },
+ {
+ "epoch": 0.9736842105263158,
+ "grad_norm": 15.588600158691406,
+ "learning_rate": 4.995624348696071e-06,
+ "loss": 0.2486,
+ "step": 111
+ },
+ {
+ "epoch": 0.9824561403508771,
+ "grad_norm": 2.631044387817383,
+ "learning_rate": 4.9947929021634815e-06,
+ "loss": 0.1964,
+ "step": 112
+ },
+ {
+ "epoch": 0.9912280701754386,
+ "grad_norm": 4.706504821777344,
+ "learning_rate": 4.993889260567239e-06,
+ "loss": 0.1901,
+ "step": 113
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 10.368465423583984,
+ "learning_rate": 4.9929134500571954e-06,
+ "loss": 0.1996,
+ "step": 114
+ },
+ {
+ "epoch": 1.0087719298245614,
+ "grad_norm": 30.44986343383789,
+ "learning_rate": 4.991865498871647e-06,
+ "loss": 0.2606,
+ "step": 115
+ },
+ {
+ "epoch": 1.0175438596491229,
+ "grad_norm": 14.421515464782715,
+ "learning_rate": 4.99074543733652e-06,
+ "loss": 0.2394,
+ "step": 116
+ },
+ {
+ "epoch": 1.0263157894736843,
+ "grad_norm": 14.072005271911621,
+ "learning_rate": 4.989553297864489e-06,
+ "loss": 0.2288,
+ "step": 117
+ },
+ {
+ "epoch": 1.0350877192982457,
+ "grad_norm": 4.395325660705566,
+ "learning_rate": 4.988289114954045e-06,
+ "loss": 0.2129,
+ "step": 118
+ },
+ {
+ "epoch": 1.043859649122807,
+ "grad_norm": 7.286703586578369,
+ "learning_rate": 4.986952925188489e-06,
+ "loss": 0.186,
+ "step": 119
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 8.332784652709961,
+ "learning_rate": 4.98554476723488e-06,
+ "loss": 0.178,
+ "step": 120
+ },
+ {
+ "epoch": 1.0614035087719298,
+ "grad_norm": 1.3646447658538818,
+ "learning_rate": 4.984064681842917e-06,
+ "loss": 0.1687,
+ "step": 121
+ },
+ {
+ "epoch": 1.0701754385964912,
+ "grad_norm": 4.494940757751465,
+ "learning_rate": 4.982512711843753e-06,
+ "loss": 0.1881,
+ "step": 122
+ },
+ {
+ "epoch": 1.0789473684210527,
+ "grad_norm": 3.3929836750030518,
+ "learning_rate": 4.980888902148757e-06,
+ "loss": 0.1764,
+ "step": 123
+ },
+ {
+ "epoch": 1.087719298245614,
+ "grad_norm": 1.8281155824661255,
+ "learning_rate": 4.979193299748225e-06,
+ "loss": 0.1602,
+ "step": 124
+ },
+ {
+ "epoch": 1.0964912280701755,
+ "grad_norm": 3.494239568710327,
+ "learning_rate": 4.977425953710005e-06,
+ "loss": 0.1729,
+ "step": 125
+ },
+ {
+ "epoch": 1.1052631578947367,
+ "grad_norm": 1.500410556793213,
+ "learning_rate": 4.975586915178084e-06,
+ "loss": 0.1666,
+ "step": 126
+ },
+ {
+ "epoch": 1.1140350877192982,
+ "grad_norm": 1.4680222272872925,
+ "learning_rate": 4.973676237371111e-06,
+ "loss": 0.159,
+ "step": 127
+ },
+ {
+ "epoch": 1.1228070175438596,
+ "grad_norm": 3.0383460521698,
+ "learning_rate": 4.971693975580851e-06,
+ "loss": 0.1484,
+ "step": 128
+ },
+ {
+ "epoch": 1.131578947368421,
+ "grad_norm": 3.74821138381958,
+ "learning_rate": 4.969640187170591e-06,
+ "loss": 0.1586,
+ "step": 129
+ },
+ {
+ "epoch": 1.1403508771929824,
+ "grad_norm": 4.682602405548096,
+ "learning_rate": 4.967514931573473e-06,
+ "loss": 0.1619,
+ "step": 130
+ },
+ {
+ "epoch": 1.1491228070175439,
+ "grad_norm": 3.90673565864563,
+ "learning_rate": 4.965318270290779e-06,
+ "loss": 0.164,
+ "step": 131
+ },
+ {
+ "epoch": 1.1578947368421053,
+ "grad_norm": 2.2017388343811035,
+ "learning_rate": 4.963050266890152e-06,
+ "loss": 0.1499,
+ "step": 132
+ },
+ {
+ "epoch": 1.1666666666666667,
+ "grad_norm": 2.4211816787719727,
+ "learning_rate": 4.960710987003753e-06,
+ "loss": 0.1387,
+ "step": 133
+ },
+ {
+ "epoch": 1.1754385964912282,
+ "grad_norm": 1.7753759622573853,
+ "learning_rate": 4.958300498326363e-06,
+ "loss": 0.1441,
+ "step": 134
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 1.5529910326004028,
+ "learning_rate": 4.955818870613425e-06,
+ "loss": 0.1304,
+ "step": 135
+ },
+ {
+ "epoch": 1.1929824561403508,
+ "grad_norm": 2.090593099594116,
+ "learning_rate": 4.953266175679023e-06,
+ "loss": 0.1419,
+ "step": 136
+ },
+ {
+ "epoch": 1.2017543859649122,
+ "grad_norm": 2.7141878604888916,
+ "learning_rate": 4.95064248739381e-06,
+ "loss": 0.1444,
+ "step": 137
+ },
+ {
+ "epoch": 1.2105263157894737,
+ "grad_norm": 2.3690481185913086,
+ "learning_rate": 4.947947881682861e-06,
+ "loss": 0.1383,
+ "step": 138
+ },
+ {
+ "epoch": 1.219298245614035,
+ "grad_norm": 2.2403147220611572,
+ "learning_rate": 4.945182436523482e-06,
+ "loss": 0.1418,
+ "step": 139
+ },
+ {
+ "epoch": 1.2280701754385965,
+ "grad_norm": 1.3939160108566284,
+ "learning_rate": 4.942346231942955e-06,
+ "loss": 0.1307,
+ "step": 140
+ },
+ {
+ "epoch": 1.236842105263158,
+ "grad_norm": 11.276732444763184,
+ "learning_rate": 4.939439350016214e-06,
+ "loss": 0.1397,
+ "step": 141
+ },
+ {
+ "epoch": 1.2456140350877192,
+ "grad_norm": 8.260516166687012,
+ "learning_rate": 4.9364618748634794e-06,
+ "loss": 0.1426,
+ "step": 142
+ },
+ {
+ "epoch": 1.2543859649122808,
+ "grad_norm": 2.09720516204834,
+ "learning_rate": 4.933413892647819e-06,
+ "loss": 0.1323,
+ "step": 143
+ },
+ {
+ "epoch": 1.263157894736842,
+ "grad_norm": 1.802125334739685,
+ "learning_rate": 4.9302954915726535e-06,
+ "loss": 0.1304,
+ "step": 144
+ },
+ {
+ "epoch": 1.2719298245614035,
+ "grad_norm": 1.7151471376419067,
+ "learning_rate": 4.927106761879207e-06,
+ "loss": 0.1264,
+ "step": 145
+ },
+ {
+ "epoch": 1.280701754385965,
+ "grad_norm": 1.6970336437225342,
+ "learning_rate": 4.923847795843894e-06,
+ "loss": 0.1227,
+ "step": 146
+ },
+ {
+ "epoch": 1.2894736842105263,
+ "grad_norm": 16.60441017150879,
+ "learning_rate": 4.920518687775647e-06,
+ "loss": 0.1606,
+ "step": 147
+ },
+ {
+ "epoch": 1.2982456140350878,
+ "grad_norm": 6.470354080200195,
+ "learning_rate": 4.917119534013194e-06,
+ "loss": 0.1447,
+ "step": 148
+ },
+ {
+ "epoch": 1.3070175438596492,
+ "grad_norm": 1.4908231496810913,
+ "learning_rate": 4.913650432922264e-06,
+ "loss": 0.1343,
+ "step": 149
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.19964861869812,
+ "learning_rate": 4.91011148489274e-06,
+ "loss": 0.1354,
+ "step": 150
+ },
+ {
+ "epoch": 1.3245614035087718,
+ "grad_norm": 2.6052839756011963,
+ "learning_rate": 4.906502792335761e-06,
+ "loss": 0.1342,
+ "step": 151
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 2.0719165802001953,
+ "learning_rate": 4.9028244596807525e-06,
+ "loss": 0.1359,
+ "step": 152
+ },
+ {
+ "epoch": 1.3421052631578947,
+ "grad_norm": 0.8086919784545898,
+ "learning_rate": 4.899076593372405e-06,
+ "loss": 0.1279,
+ "step": 153
+ },
+ {
+ "epoch": 1.3508771929824561,
+ "grad_norm": 1.0056848526000977,
+ "learning_rate": 4.8952593018675955e-06,
+ "loss": 0.1162,
+ "step": 154
+ },
+ {
+ "epoch": 1.3596491228070176,
+ "grad_norm": 5.72553014755249,
+ "learning_rate": 4.891372695632249e-06,
+ "loss": 0.1315,
+ "step": 155
+ },
+ {
+ "epoch": 1.368421052631579,
+ "grad_norm": 1.522894024848938,
+ "learning_rate": 4.887416887138139e-06,
+ "loss": 0.1266,
+ "step": 156
+ },
+ {
+ "epoch": 1.3771929824561404,
+ "grad_norm": 2.019472122192383,
+ "learning_rate": 4.883391990859635e-06,
+ "loss": 0.1262,
+ "step": 157
+ },
+ {
+ "epoch": 1.3859649122807016,
+ "grad_norm": 1.8594422340393066,
+ "learning_rate": 4.879298123270391e-06,
+ "loss": 0.125,
+ "step": 158
+ },
+ {
+ "epoch": 1.3947368421052633,
+ "grad_norm": 1.365377426147461,
+ "learning_rate": 4.8751354028399725e-06,
+ "loss": 0.1218,
+ "step": 159
+ },
+ {
+ "epoch": 1.4035087719298245,
+ "grad_norm": 3.553309917449951,
+ "learning_rate": 4.870903950030429e-06,
+ "loss": 0.1272,
+ "step": 160
+ },
+ {
+ "epoch": 1.412280701754386,
+ "grad_norm": 2.1770920753479004,
+ "learning_rate": 4.866603887292809e-06,
+ "loss": 0.1213,
+ "step": 161
+ },
+ {
+ "epoch": 1.4210526315789473,
+ "grad_norm": 1.6058955192565918,
+ "learning_rate": 4.862235339063613e-06,
+ "loss": 0.1173,
+ "step": 162
+ },
+ {
+ "epoch": 1.4298245614035088,
+ "grad_norm": 1.3208314180374146,
+ "learning_rate": 4.857798431761199e-06,
+ "loss": 0.1183,
+ "step": 163
+ },
+ {
+ "epoch": 1.4385964912280702,
+ "grad_norm": 1.282729983329773,
+ "learning_rate": 4.853293293782118e-06,
+ "loss": 0.1209,
+ "step": 164
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 1.3838152885437012,
+ "learning_rate": 4.848720055497401e-06,
+ "loss": 0.1198,
+ "step": 165
+ },
+ {
+ "epoch": 1.456140350877193,
+ "grad_norm": 1.2930737733840942,
+ "learning_rate": 4.844078849248785e-06,
+ "loss": 0.1268,
+ "step": 166
+ },
+ {
+ "epoch": 1.4649122807017543,
+ "grad_norm": 1.7022266387939453,
+ "learning_rate": 4.839369809344888e-06,
+ "loss": 0.1198,
+ "step": 167
+ },
+ {
+ "epoch": 1.4736842105263157,
+ "grad_norm": 1.0927815437316895,
+ "learning_rate": 4.834593072057313e-06,
+ "loss": 0.1132,
+ "step": 168
+ },
+ {
+ "epoch": 1.4824561403508771,
+ "grad_norm": 0.9326333999633789,
+ "learning_rate": 4.829748775616716e-06,
+ "loss": 0.1193,
+ "step": 169
+ },
+ {
+ "epoch": 1.4912280701754386,
+ "grad_norm": 1.3564742803573608,
+ "learning_rate": 4.8248370602087954e-06,
+ "loss": 0.118,
+ "step": 170
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.19778573513031,
+ "learning_rate": 4.819858067970243e-06,
+ "loss": 0.1122,
+ "step": 171
+ },
+ {
+ "epoch": 1.5087719298245614,
+ "grad_norm": 2.8438351154327393,
+ "learning_rate": 4.814811942984625e-06,
+ "loss": 0.1217,
+ "step": 172
+ },
+ {
+ "epoch": 1.5175438596491229,
+ "grad_norm": 1.0701063871383667,
+ "learning_rate": 4.809698831278217e-06,
+ "loss": 0.1114,
+ "step": 173
+ },
+ {
+ "epoch": 1.526315789473684,
+ "grad_norm": 0.9053553938865662,
+ "learning_rate": 4.804518880815776e-06,
+ "loss": 0.1178,
+ "step": 174
+ },
+ {
+ "epoch": 1.5350877192982457,
+ "grad_norm": 0.42274603247642517,
+ "learning_rate": 4.799272241496259e-06,
+ "loss": 0.1091,
+ "step": 175
+ },
+ {
+ "epoch": 1.543859649122807,
+ "grad_norm": 0.8576470017433167,
+ "learning_rate": 4.793959065148484e-06,
+ "loss": 0.1134,
+ "step": 176
+ },
+ {
+ "epoch": 1.5526315789473686,
+ "grad_norm": 0.5910662412643433,
+ "learning_rate": 4.78857950552674e-06,
+ "loss": 0.1148,
+ "step": 177
+ },
+ {
+ "epoch": 1.5614035087719298,
+ "grad_norm": 0.8761632442474365,
+ "learning_rate": 4.783133718306331e-06,
+ "loss": 0.1125,
+ "step": 178
+ },
+ {
+ "epoch": 1.5701754385964912,
+ "grad_norm": 1.9190795421600342,
+ "learning_rate": 4.777621861079079e-06,
+ "loss": 0.1148,
+ "step": 179
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 0.6199957728385925,
+ "learning_rate": 4.772044093348757e-06,
+ "loss": 0.1097,
+ "step": 180
+ },
+ {
+ "epoch": 1.587719298245614,
+ "grad_norm": 1.562089443206787,
+ "learning_rate": 4.766400576526479e-06,
+ "loss": 0.1097,
+ "step": 181
+ },
+ {
+ "epoch": 1.5964912280701755,
+ "grad_norm": 1.4957091808319092,
+ "learning_rate": 4.760691473926021e-06,
+ "loss": 0.1216,
+ "step": 182
+ },
+ {
+ "epoch": 1.6052631578947367,
+ "grad_norm": 0.9863570332527161,
+ "learning_rate": 4.754916950759105e-06,
+ "loss": 0.1122,
+ "step": 183
+ },
+ {
+ "epoch": 1.6140350877192984,
+ "grad_norm": 0.5803346633911133,
+ "learning_rate": 4.749077174130609e-06,
+ "loss": 0.1103,
+ "step": 184
+ },
+ {
+ "epoch": 1.6228070175438596,
+ "grad_norm": 1.8789891004562378,
+ "learning_rate": 4.743172313033738e-06,
+ "loss": 0.1191,
+ "step": 185
+ },
+ {
+ "epoch": 1.631578947368421,
+ "grad_norm": 0.8731380105018616,
+ "learning_rate": 4.7372025383451285e-06,
+ "loss": 0.1154,
+ "step": 186
+ },
+ {
+ "epoch": 1.6403508771929824,
+ "grad_norm": 1.3535627126693726,
+ "learning_rate": 4.7311680228199075e-06,
+ "loss": 0.1123,
+ "step": 187
+ },
+ {
+ "epoch": 1.6491228070175439,
+ "grad_norm": 0.7211089134216309,
+ "learning_rate": 4.725068941086693e-06,
+ "loss": 0.1134,
+ "step": 188
+ },
+ {
+ "epoch": 1.6578947368421053,
+ "grad_norm": 1.4752328395843506,
+ "learning_rate": 4.718905469642534e-06,
+ "loss": 0.1185,
+ "step": 189
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 0.9822680354118347,
+ "learning_rate": 4.712677786847814e-06,
+ "loss": 0.1146,
+ "step": 190
+ },
+ {
+ "epoch": 1.6754385964912282,
+ "grad_norm": 1.1308330297470093,
+ "learning_rate": 4.706386072921083e-06,
+ "loss": 0.1061,
+ "step": 191
+ },
+ {
+ "epoch": 1.6842105263157894,
+ "grad_norm": 5.331939697265625,
+ "learning_rate": 4.70003050993384e-06,
+ "loss": 0.1153,
+ "step": 192
+ },
+ {
+ "epoch": 1.692982456140351,
+ "grad_norm": 0.6911673545837402,
+ "learning_rate": 4.6936112818052674e-06,
+ "loss": 0.1098,
+ "step": 193
+ },
+ {
+ "epoch": 1.7017543859649122,
+ "grad_norm": 0.5160980224609375,
+ "learning_rate": 4.687128574296912e-06,
+ "loss": 0.1073,
+ "step": 194
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 1.5724798440933228,
+ "learning_rate": 4.680582575007303e-06,
+ "loss": 0.121,
+ "step": 195
+ },
+ {
+ "epoch": 1.719298245614035,
+ "grad_norm": 1.3960011005401611,
+ "learning_rate": 4.6739734733665275e-06,
+ "loss": 0.1145,
+ "step": 196
+ },
+ {
+ "epoch": 1.7280701754385965,
+ "grad_norm": 1.4949183464050293,
+ "learning_rate": 4.6673014606307465e-06,
+ "loss": 0.1166,
+ "step": 197
+ },
+ {
+ "epoch": 1.736842105263158,
+ "grad_norm": 1.6873422861099243,
+ "learning_rate": 4.660566729876661e-06,
+ "loss": 0.1115,
+ "step": 198
+ },
+ {
+ "epoch": 1.7456140350877192,
+ "grad_norm": 1.3443641662597656,
+ "learning_rate": 4.653769475995926e-06,
+ "loss": 0.1119,
+ "step": 199
+ },
+ {
+ "epoch": 1.7543859649122808,
+ "grad_norm": 0.807525098323822,
+ "learning_rate": 4.646909895689508e-06,
+ "loss": 0.1059,
+ "step": 200
+ },
+ {
+ "epoch": 1.763157894736842,
+ "grad_norm": 1.589316964149475,
+ "learning_rate": 4.639988187461995e-06,
+ "loss": 0.1151,
+ "step": 201
+ },
+ {
+ "epoch": 1.7719298245614035,
+ "grad_norm": 2.474756956100464,
+ "learning_rate": 4.633004551615851e-06,
+ "loss": 0.116,
+ "step": 202
+ },
+ {
+ "epoch": 1.780701754385965,
+ "grad_norm": 0.6210195422172546,
+ "learning_rate": 4.62595919024562e-06,
+ "loss": 0.1097,
+ "step": 203
+ },
+ {
+ "epoch": 1.7894736842105263,
+ "grad_norm": 0.7217905521392822,
+ "learning_rate": 4.618852307232078e-06,
+ "loss": 0.1117,
+ "step": 204
+ },
+ {
+ "epoch": 1.7982456140350878,
+ "grad_norm": 1.551251769065857,
+ "learning_rate": 4.611684108236334e-06,
+ "loss": 0.113,
+ "step": 205
+ },
+ {
+ "epoch": 1.807017543859649,
+ "grad_norm": 0.6619828939437866,
+ "learning_rate": 4.604454800693874e-06,
+ "loss": 0.113,
+ "step": 206
+ },
+ {
+ "epoch": 1.8157894736842106,
+ "grad_norm": 0.9461805820465088,
+ "learning_rate": 4.597164593808564e-06,
+ "loss": 0.1093,
+ "step": 207
+ },
+ {
+ "epoch": 1.8245614035087718,
+ "grad_norm": 1.2926547527313232,
+ "learning_rate": 4.589813698546592e-06,
+ "loss": 0.1128,
+ "step": 208
+ },
+ {
+ "epoch": 1.8333333333333335,
+ "grad_norm": 0.8754212856292725,
+ "learning_rate": 4.582402327630368e-06,
+ "loss": 0.1104,
+ "step": 209
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 0.846051812171936,
+ "learning_rate": 4.574930695532357e-06,
+ "loss": 0.1105,
+ "step": 210
+ },
+ {
+ "epoch": 1.8508771929824561,
+ "grad_norm": 1.3332515954971313,
+ "learning_rate": 4.567399018468889e-06,
+ "loss": 0.1101,
+ "step": 211
+ },
+ {
+ "epoch": 1.8596491228070176,
+ "grad_norm": 0.8729192614555359,
+ "learning_rate": 4.5598075143938855e-06,
+ "loss": 0.1081,
+ "step": 212
+ },
+ {
+ "epoch": 1.868421052631579,
+ "grad_norm": 0.8618345260620117,
+ "learning_rate": 4.552156402992567e-06,
+ "loss": 0.1059,
+ "step": 213
+ },
+ {
+ "epoch": 1.8771929824561404,
+ "grad_norm": 1.2135930061340332,
+ "learning_rate": 4.544445905675082e-06,
+ "loss": 0.1105,
+ "step": 214
+ },
+ {
+ "epoch": 1.8859649122807016,
+ "grad_norm": 0.8405666351318359,
+ "learning_rate": 4.536676245570111e-06,
+ "loss": 0.1118,
+ "step": 215
+ },
+ {
+ "epoch": 1.8947368421052633,
+ "grad_norm": 0.42860639095306396,
+ "learning_rate": 4.528847647518403e-06,
+ "loss": 0.1093,
+ "step": 216
+ },
+ {
+ "epoch": 1.9035087719298245,
+ "grad_norm": 1.1538206338882446,
+ "learning_rate": 4.520960338066271e-06,
+ "loss": 0.1088,
+ "step": 217
+ },
+ {
+ "epoch": 1.912280701754386,
+ "grad_norm": 0.5870749354362488,
+ "learning_rate": 4.513014545459038e-06,
+ "loss": 0.1061,
+ "step": 218
+ },
+ {
+ "epoch": 1.9210526315789473,
+ "grad_norm": 0.7279748916625977,
+ "learning_rate": 4.505010499634427e-06,
+ "loss": 0.1032,
+ "step": 219
+ },
+ {
+ "epoch": 1.9298245614035088,
+ "grad_norm": 0.6331414580345154,
+ "learning_rate": 4.4969484322159125e-06,
+ "loss": 0.1109,
+ "step": 220
+ },
+ {
+ "epoch": 1.9385964912280702,
+ "grad_norm": 0.9024543166160583,
+ "learning_rate": 4.488828576506014e-06,
+ "loss": 0.1094,
+ "step": 221
+ },
+ {
+ "epoch": 1.9473684210526314,
+ "grad_norm": 3.540376901626587,
+ "learning_rate": 4.480651167479545e-06,
+ "loss": 0.1154,
+ "step": 222
+ },
+ {
+ "epoch": 1.956140350877193,
+ "grad_norm": 0.9506739377975464,
+ "learning_rate": 4.472416441776817e-06,
+ "loss": 0.108,
+ "step": 223
+ },
+ {
+ "epoch": 1.9649122807017543,
+ "grad_norm": 0.6585081815719604,
+ "learning_rate": 4.464124637696786e-06,
+ "loss": 0.1033,
+ "step": 224
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 1.143038034439087,
+ "learning_rate": 4.455775995190161e-06,
+ "loss": 0.1092,
+ "step": 225
+ },
+ {
+ "epoch": 1.9824561403508771,
+ "grad_norm": 1.148261547088623,
+ "learning_rate": 4.4473707558524555e-06,
+ "loss": 0.1076,
+ "step": 226
+ },
+ {
+ "epoch": 1.9912280701754386,
+ "grad_norm": 0.7375811338424683,
+ "learning_rate": 4.438909162917003e-06,
+ "loss": 0.108,
+ "step": 227
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.5254591703414917,
+ "learning_rate": 4.430391461247911e-06,
+ "loss": 0.1079,
+ "step": 228
+ },
+ {
+ "epoch": 2.008771929824561,
+ "grad_norm": 1.0198495388031006,
+ "learning_rate": 4.42181789733298e-06,
+ "loss": 0.1083,
+ "step": 229
+ },
+ {
+ "epoch": 2.017543859649123,
+ "grad_norm": 0.9234157800674438,
+ "learning_rate": 4.413188719276569e-06,
+ "loss": 0.1084,
+ "step": 230
+ },
+ {
+ "epoch": 2.026315789473684,
+ "grad_norm": 0.5215068459510803,
+ "learning_rate": 4.404504176792414e-06,
+ "loss": 0.1067,
+ "step": 231
+ },
+ {
+ "epoch": 2.0350877192982457,
+ "grad_norm": 0.9296736121177673,
+ "learning_rate": 4.3957645211964065e-06,
+ "loss": 0.1066,
+ "step": 232
+ },
+ {
+ "epoch": 2.043859649122807,
+ "grad_norm": 0.8660671710968018,
+ "learning_rate": 4.386970005399314e-06,
+ "loss": 0.108,
+ "step": 233
+ },
+ {
+ "epoch": 2.0526315789473686,
+ "grad_norm": 0.6014883518218994,
+ "learning_rate": 4.378120883899467e-06,
+ "loss": 0.1068,
+ "step": 234
+ },
+ {
+ "epoch": 2.06140350877193,
+ "grad_norm": 0.6370371580123901,
+ "learning_rate": 4.369217412775393e-06,
+ "loss": 0.1076,
+ "step": 235
+ },
+ {
+ "epoch": 2.0701754385964914,
+ "grad_norm": 0.9806828498840332,
+ "learning_rate": 4.360259849678402e-06,
+ "loss": 0.1071,
+ "step": 236
+ },
+ {
+ "epoch": 2.0789473684210527,
+ "grad_norm": 0.6093440651893616,
+ "learning_rate": 4.351248453825137e-06,
+ "loss": 0.1038,
+ "step": 237
+ },
+ {
+ "epoch": 2.087719298245614,
+ "grad_norm": 1.3494842052459717,
+ "learning_rate": 4.3421834859900695e-06,
+ "loss": 0.1105,
+ "step": 238
+ },
+ {
+ "epoch": 2.0964912280701755,
+ "grad_norm": 0.7621576189994812,
+ "learning_rate": 4.333065208497949e-06,
+ "loss": 0.1048,
+ "step": 239
+ },
+ {
+ "epoch": 2.1052631578947367,
+ "grad_norm": 0.5918282866477966,
+ "learning_rate": 4.3238938852162195e-06,
+ "loss": 0.1086,
+ "step": 240
+ },
+ {
+ "epoch": 2.1140350877192984,
+ "grad_norm": 0.7048676609992981,
+ "learning_rate": 4.314669781547379e-06,
+ "loss": 0.1061,
+ "step": 241
+ },
+ {
+ "epoch": 2.1228070175438596,
+ "grad_norm": 1.0750821828842163,
+ "learning_rate": 4.305393164421301e-06,
+ "loss": 0.1082,
+ "step": 242
+ },
+ {
+ "epoch": 2.1315789473684212,
+ "grad_norm": 0.6171414852142334,
+ "learning_rate": 4.296064302287507e-06,
+ "loss": 0.1039,
+ "step": 243
+ },
+ {
+ "epoch": 2.1403508771929824,
+ "grad_norm": 0.8080905079841614,
+ "learning_rate": 4.286683465107403e-06,
+ "loss": 0.1069,
+ "step": 244
+ },
+ {
+ "epoch": 2.1491228070175437,
+ "grad_norm": 0.5281466245651245,
+ "learning_rate": 4.277250924346461e-06,
+ "loss": 0.1069,
+ "step": 245
+ },
+ {
+ "epoch": 2.1578947368421053,
+ "grad_norm": 0.8070254325866699,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.1061,
+ "step": 246
+ },
+ {
+ "epoch": 2.1666666666666665,
+ "grad_norm": 0.8560577630996704,
+ "learning_rate": 4.25823182541713e-06,
+ "loss": 0.1116,
+ "step": 247
+ },
+ {
+ "epoch": 2.175438596491228,
+ "grad_norm": 0.7772330045700073,
+ "learning_rate": 4.2486458176291176e-06,
+ "loss": 0.1092,
+ "step": 248
+ },
+ {
+ "epoch": 2.1842105263157894,
+ "grad_norm": 0.814601719379425,
+ "learning_rate": 4.239009207005096e-06,
+ "loss": 0.1093,
+ "step": 249
+ },
+ {
+ "epoch": 2.192982456140351,
+ "grad_norm": 0.957789957523346,
+ "learning_rate": 4.2293222724121855e-06,
+ "loss": 0.1075,
+ "step": 250
+ },
+ {
+ "epoch": 2.2017543859649122,
+ "grad_norm": 0.500062108039856,
+ "learning_rate": 4.219585294173799e-06,
+ "loss": 0.1048,
+ "step": 251
+ },
+ {
+ "epoch": 2.2105263157894735,
+ "grad_norm": 0.3866419792175293,
+ "learning_rate": 4.209798554061527e-06,
+ "loss": 0.1074,
+ "step": 252
+ },
+ {
+ "epoch": 2.219298245614035,
+ "grad_norm": 1.1853291988372803,
+ "learning_rate": 4.199962335286985e-06,
+ "loss": 0.1076,
+ "step": 253
+ },
+ {
+ "epoch": 2.2280701754385963,
+ "grad_norm": 0.36602887511253357,
+ "learning_rate": 4.1900769224936125e-06,
+ "loss": 0.108,
+ "step": 254
+ },
+ {
+ "epoch": 2.236842105263158,
+ "grad_norm": 0.2530711889266968,
+ "learning_rate": 4.180142601748447e-06,
+ "loss": 0.1041,
+ "step": 255
+ },
+ {
+ "epoch": 2.245614035087719,
+ "grad_norm": 1.3067054748535156,
+ "learning_rate": 4.170159660533834e-06,
+ "loss": 0.1087,
+ "step": 256
+ },
+ {
+ "epoch": 2.254385964912281,
+ "grad_norm": 0.3442043960094452,
+ "learning_rate": 4.160128387739114e-06,
+ "loss": 0.1099,
+ "step": 257
+ },
+ {
+ "epoch": 2.263157894736842,
+ "grad_norm": 1.174796462059021,
+ "learning_rate": 4.150049073652262e-06,
+ "loss": 0.1063,
+ "step": 258
+ },
+ {
+ "epoch": 2.2719298245614037,
+ "grad_norm": 0.5719411969184875,
+ "learning_rate": 4.1399220099514845e-06,
+ "loss": 0.1043,
+ "step": 259
+ },
+ {
+ "epoch": 2.280701754385965,
+ "grad_norm": 0.7268956303596497,
+ "learning_rate": 4.129747489696781e-06,
+ "loss": 0.1038,
+ "step": 260
+ },
+ {
+ "epoch": 2.2894736842105265,
+ "grad_norm": 0.7028316259384155,
+ "learning_rate": 4.119525807321467e-06,
+ "loss": 0.1052,
+ "step": 261
+ },
+ {
+ "epoch": 2.2982456140350878,
+ "grad_norm": 1.015335202217102,
+ "learning_rate": 4.109257258623644e-06,
+ "loss": 0.1116,
+ "step": 262
+ },
+ {
+ "epoch": 2.307017543859649,
+ "grad_norm": 0.7141755819320679,
+ "learning_rate": 4.098942140757646e-06,
+ "loss": 0.108,
+ "step": 263
+ },
+ {
+ "epoch": 2.3157894736842106,
+ "grad_norm": 0.7656403183937073,
+ "learning_rate": 4.0885807522254435e-06,
+ "loss": 0.1043,
+ "step": 264
+ },
+ {
+ "epoch": 2.324561403508772,
+ "grad_norm": 0.43293774127960205,
+ "learning_rate": 4.078173392867998e-06,
+ "loss": 0.1048,
+ "step": 265
+ },
+ {
+ "epoch": 2.3333333333333335,
+ "grad_norm": 0.6755763292312622,
+ "learning_rate": 4.0677203638565895e-06,
+ "loss": 0.1064,
+ "step": 266
+ },
+ {
+ "epoch": 2.3421052631578947,
+ "grad_norm": 0.9648827314376831,
+ "learning_rate": 4.0572219676841e-06,
+ "loss": 0.1088,
+ "step": 267
+ },
+ {
+ "epoch": 2.3508771929824563,
+ "grad_norm": 0.32724836468696594,
+ "learning_rate": 4.046678508156259e-06,
+ "loss": 0.1077,
+ "step": 268
+ },
+ {
+ "epoch": 2.3596491228070176,
+ "grad_norm": 0.4696657061576843,
+ "learning_rate": 4.036090290382855e-06,
+ "loss": 0.1067,
+ "step": 269
+ },
+ {
+ "epoch": 2.3684210526315788,
+ "grad_norm": 0.33901306986808777,
+ "learning_rate": 4.025457620768901e-06,
+ "loss": 0.105,
+ "step": 270
+ },
+ {
+ "epoch": 2.3771929824561404,
+ "grad_norm": 0.5703794360160828,
+ "learning_rate": 4.014780807005775e-06,
+ "loss": 0.1033,
+ "step": 271
+ },
+ {
+ "epoch": 2.3859649122807016,
+ "grad_norm": 0.9639355540275574,
+ "learning_rate": 4.004060158062306e-06,
+ "loss": 0.1041,
+ "step": 272
+ },
+ {
+ "epoch": 2.3947368421052633,
+ "grad_norm": 0.8851558566093445,
+ "learning_rate": 3.993295984175845e-06,
+ "loss": 0.1064,
+ "step": 273
+ },
+ {
+ "epoch": 2.4035087719298245,
+ "grad_norm": 0.5200062990188599,
+ "learning_rate": 3.982488596843276e-06,
+ "loss": 0.1056,
+ "step": 274
+ },
+ {
+ "epoch": 2.412280701754386,
+ "grad_norm": 1.160823106765747,
+ "learning_rate": 3.971638308812007e-06,
+ "loss": 0.1069,
+ "step": 275
+ },
+ {
+ "epoch": 2.4210526315789473,
+ "grad_norm": 1.0191210508346558,
+ "learning_rate": 3.9607454340709215e-06,
+ "loss": 0.1042,
+ "step": 276
+ },
+ {
+ "epoch": 2.4298245614035086,
+ "grad_norm": 0.37181487679481506,
+ "learning_rate": 3.949810287841289e-06,
+ "loss": 0.1062,
+ "step": 277
+ },
+ {
+ "epoch": 2.43859649122807,
+ "grad_norm": 0.9328593611717224,
+ "learning_rate": 3.9388331865676436e-06,
+ "loss": 0.1086,
+ "step": 278
+ },
+ {
+ "epoch": 2.4473684210526314,
+ "grad_norm": 0.8024734258651733,
+ "learning_rate": 3.927814447908625e-06,
+ "loss": 0.1051,
+ "step": 279
+ },
+ {
+ "epoch": 2.456140350877193,
+ "grad_norm": 0.9746696352958679,
+ "learning_rate": 3.916754390727795e-06,
+ "loss": 0.1041,
+ "step": 280
+ },
+ {
+ "epoch": 2.4649122807017543,
+ "grad_norm": 0.5457844138145447,
+ "learning_rate": 3.905653335084394e-06,
+ "loss": 0.1052,
+ "step": 281
+ },
+ {
+ "epoch": 2.473684210526316,
+ "grad_norm": 1.0736924409866333,
+ "learning_rate": 3.8945116022240945e-06,
+ "loss": 0.1075,
+ "step": 282
+ },
+ {
+ "epoch": 2.482456140350877,
+ "grad_norm": 0.6335628032684326,
+ "learning_rate": 3.8833295145696964e-06,
+ "loss": 0.1036,
+ "step": 283
+ },
+ {
+ "epoch": 2.4912280701754383,
+ "grad_norm": 0.6909618377685547,
+ "learning_rate": 3.872107395711799e-06,
+ "loss": 0.1089,
+ "step": 284
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 2.1871702671051025,
+ "learning_rate": 3.860845570399435e-06,
+ "loss": 0.1066,
+ "step": 285
+ },
+ {
+ "epoch": 2.5087719298245617,
+ "grad_norm": 0.5831722617149353,
+ "learning_rate": 3.849544364530678e-06,
+ "loss": 0.1055,
+ "step": 286
+ },
+ {
+ "epoch": 2.517543859649123,
+ "grad_norm": 0.5302637815475464,
+ "learning_rate": 3.838204105143204e-06,
+ "loss": 0.1057,
+ "step": 287
+ },
+ {
+ "epoch": 2.526315789473684,
+ "grad_norm": 0.6348035931587219,
+ "learning_rate": 3.8268251204048335e-06,
+ "loss": 0.1089,
+ "step": 288
+ },
+ {
+ "epoch": 2.5350877192982457,
+ "grad_norm": 2.1932008266448975,
+ "learning_rate": 3.815407739604033e-06,
+ "loss": 0.1043,
+ "step": 289
+ },
+ {
+ "epoch": 2.543859649122807,
+ "grad_norm": 0.4388940930366516,
+ "learning_rate": 3.803952293140385e-06,
+ "loss": 0.1055,
+ "step": 290
+ },
+ {
+ "epoch": 2.5526315789473686,
+ "grad_norm": 0.6853339076042175,
+ "learning_rate": 3.7924591125150265e-06,
+ "loss": 0.1036,
+ "step": 291
+ },
+ {
+ "epoch": 2.56140350877193,
+ "grad_norm": 0.34744876623153687,
+ "learning_rate": 3.78092853032106e-06,
+ "loss": 0.1025,
+ "step": 292
+ },
+ {
+ "epoch": 2.5701754385964914,
+ "grad_norm": 0.9523847699165344,
+ "learning_rate": 3.769360880233922e-06,
+ "loss": 0.1067,
+ "step": 293
+ },
+ {
+ "epoch": 2.5789473684210527,
+ "grad_norm": 1.303745985031128,
+ "learning_rate": 3.7577564970017338e-06,
+ "loss": 0.1082,
+ "step": 294
+ },
+ {
+ "epoch": 2.587719298245614,
+ "grad_norm": 0.9468981623649597,
+ "learning_rate": 3.7461157164356103e-06,
+ "loss": 0.1055,
+ "step": 295
+ },
+ {
+ "epoch": 2.5964912280701755,
+ "grad_norm": 0.7204175591468811,
+ "learning_rate": 3.7344388753999434e-06,
+ "loss": 0.1055,
+ "step": 296
+ },
+ {
+ "epoch": 2.6052631578947367,
+ "grad_norm": 0.5110165476799011,
+ "learning_rate": 3.7227263118026537e-06,
+ "loss": 0.1092,
+ "step": 297
+ },
+ {
+ "epoch": 2.6140350877192984,
+ "grad_norm": 0.6483246088027954,
+ "learning_rate": 3.7109783645854116e-06,
+ "loss": 0.1078,
+ "step": 298
+ },
+ {
+ "epoch": 2.6228070175438596,
+ "grad_norm": 0.5058422684669495,
+ "learning_rate": 3.699195373713831e-06,
+ "loss": 0.1073,
+ "step": 299
+ },
+ {
+ "epoch": 2.6315789473684212,
+ "grad_norm": 0.4123518764972687,
+ "learning_rate": 3.6873776801676265e-06,
+ "loss": 0.1053,
+ "step": 300
+ },
+ {
+ "epoch": 2.6403508771929824,
+ "grad_norm": 1.0864709615707397,
+ "learning_rate": 3.675525625930751e-06,
+ "loss": 0.1048,
+ "step": 301
+ },
+ {
+ "epoch": 2.6491228070175437,
+ "grad_norm": 1.0264904499053955,
+ "learning_rate": 3.6636395539814975e-06,
+ "loss": 0.1059,
+ "step": 302
+ },
+ {
+ "epoch": 2.6578947368421053,
+ "grad_norm": 0.7724822163581848,
+ "learning_rate": 3.651719808282573e-06,
+ "loss": 0.1063,
+ "step": 303
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 0.7474755644798279,
+ "learning_rate": 3.6397667337711475e-06,
+ "loss": 0.1034,
+ "step": 304
+ },
+ {
+ "epoch": 2.675438596491228,
+ "grad_norm": 0.5628909468650818,
+ "learning_rate": 3.6277806763488666e-06,
+ "loss": 0.1026,
+ "step": 305
+ },
+ {
+ "epoch": 2.6842105263157894,
+ "grad_norm": 0.9070547819137573,
+ "learning_rate": 3.6157619828718477e-06,
+ "loss": 0.1031,
+ "step": 306
+ },
+ {
+ "epoch": 2.692982456140351,
+ "grad_norm": 0.6968091130256653,
+ "learning_rate": 3.603711001140641e-06,
+ "loss": 0.1068,
+ "step": 307
+ },
+ {
+ "epoch": 2.7017543859649122,
+ "grad_norm": 0.3764977753162384,
+ "learning_rate": 3.5916280798901604e-06,
+ "loss": 0.1038,
+ "step": 308
+ },
+ {
+ "epoch": 2.7105263157894735,
+ "grad_norm": 5.012625694274902,
+ "learning_rate": 3.5795135687795984e-06,
+ "loss": 0.1129,
+ "step": 309
+ },
+ {
+ "epoch": 2.719298245614035,
+ "grad_norm": 0.6745572686195374,
+ "learning_rate": 3.567367818382303e-06,
+ "loss": 0.1071,
+ "step": 310
+ },
+ {
+ "epoch": 2.7280701754385968,
+ "grad_norm": 1.0659606456756592,
+ "learning_rate": 3.555191180175634e-06,
+ "loss": 0.1067,
+ "step": 311
+ },
+ {
+ "epoch": 2.736842105263158,
+ "grad_norm": 1.7312604188919067,
+ "learning_rate": 3.5429840065307924e-06,
+ "loss": 0.1101,
+ "step": 312
+ },
+ {
+ "epoch": 2.745614035087719,
+ "grad_norm": 1.100364327430725,
+ "learning_rate": 3.5307466507026223e-06,
+ "loss": 0.1098,
+ "step": 313
+ },
+ {
+ "epoch": 2.754385964912281,
+ "grad_norm": 1.0390428304672241,
+ "learning_rate": 3.5184794668193893e-06,
+ "loss": 0.1094,
+ "step": 314
+ },
+ {
+ "epoch": 2.763157894736842,
+ "grad_norm": 0.3369971811771393,
+ "learning_rate": 3.5061828098725327e-06,
+ "loss": 0.1053,
+ "step": 315
+ },
+ {
+ "epoch": 2.7719298245614032,
+ "grad_norm": 0.6130257248878479,
+ "learning_rate": 3.4938570357063906e-06,
+ "loss": 0.106,
+ "step": 316
+ },
+ {
+ "epoch": 2.780701754385965,
+ "grad_norm": 0.6387595534324646,
+ "learning_rate": 3.481502501007904e-06,
+ "loss": 0.1044,
+ "step": 317
+ },
+ {
+ "epoch": 2.7894736842105265,
+ "grad_norm": 1.0731587409973145,
+ "learning_rate": 3.469119563296296e-06,
+ "loss": 0.1097,
+ "step": 318
+ },
+ {
+ "epoch": 2.7982456140350878,
+ "grad_norm": 0.8096229434013367,
+ "learning_rate": 3.4567085809127247e-06,
+ "loss": 0.1076,
+ "step": 319
+ },
+ {
+ "epoch": 2.807017543859649,
+ "grad_norm": 0.5034844279289246,
+ "learning_rate": 3.444269913009912e-06,
+ "loss": 0.1071,
+ "step": 320
+ },
+ {
+ "epoch": 2.8157894736842106,
+ "grad_norm": 0.675139307975769,
+ "learning_rate": 3.4318039195417536e-06,
+ "loss": 0.1039,
+ "step": 321
+ },
+ {
+ "epoch": 2.824561403508772,
+ "grad_norm": 0.7330355644226074,
+ "learning_rate": 3.4193109612528972e-06,
+ "loss": 0.1044,
+ "step": 322
+ },
+ {
+ "epoch": 2.8333333333333335,
+ "grad_norm": 0.6558271646499634,
+ "learning_rate": 3.4067913996683115e-06,
+ "loss": 0.1051,
+ "step": 323
+ },
+ {
+ "epoch": 2.8421052631578947,
+ "grad_norm": 0.8411844372749329,
+ "learning_rate": 3.3942455970828146e-06,
+ "loss": 0.1063,
+ "step": 324
+ },
+ {
+ "epoch": 2.8508771929824563,
+ "grad_norm": 0.4817325174808502,
+ "learning_rate": 3.3816739165505964e-06,
+ "loss": 0.105,
+ "step": 325
+ },
+ {
+ "epoch": 2.8596491228070176,
+ "grad_norm": 0.424554705619812,
+ "learning_rate": 3.3690767218747104e-06,
+ "loss": 0.1037,
+ "step": 326
+ },
+ {
+ "epoch": 2.8684210526315788,
+ "grad_norm": 1.0054417848587036,
+ "learning_rate": 3.3564543775965475e-06,
+ "loss": 0.1058,
+ "step": 327
+ },
+ {
+ "epoch": 2.8771929824561404,
+ "grad_norm": 0.8984584808349609,
+ "learning_rate": 3.3438072489852837e-06,
+ "loss": 0.1079,
+ "step": 328
+ },
+ {
+ "epoch": 2.8859649122807016,
+ "grad_norm": 0.6779558062553406,
+ "learning_rate": 3.331135702027311e-06,
+ "loss": 0.1046,
+ "step": 329
+ },
+ {
+ "epoch": 2.8947368421052633,
+ "grad_norm": 0.6931657195091248,
+ "learning_rate": 3.318440103415649e-06,
+ "loss": 0.1106,
+ "step": 330
+ },
+ {
+ "epoch": 2.9035087719298245,
+ "grad_norm": 0.705264151096344,
+ "learning_rate": 3.305720820539329e-06,
+ "loss": 0.104,
+ "step": 331
+ },
+ {
+ "epoch": 2.912280701754386,
+ "grad_norm": 0.7799407839775085,
+ "learning_rate": 3.2929782214727657e-06,
+ "loss": 0.1019,
+ "step": 332
+ },
+ {
+ "epoch": 2.9210526315789473,
+ "grad_norm": 0.7583760619163513,
+ "learning_rate": 3.2802126749651042e-06,
+ "loss": 0.1049,
+ "step": 333
+ },
+ {
+ "epoch": 2.9298245614035086,
+ "grad_norm": 0.6145837306976318,
+ "learning_rate": 3.2674245504295505e-06,
+ "loss": 0.104,
+ "step": 334
+ },
+ {
+ "epoch": 2.93859649122807,
+ "grad_norm": 0.5170779228210449,
+ "learning_rate": 3.254614217932679e-06,
+ "loss": 0.1024,
+ "step": 335
+ },
+ {
+ "epoch": 2.9473684210526314,
+ "grad_norm": 0.6850940585136414,
+ "learning_rate": 3.241782048183726e-06,
+ "loss": 0.1047,
+ "step": 336
+ },
+ {
+ "epoch": 2.956140350877193,
+ "grad_norm": 0.7307694554328918,
+ "learning_rate": 3.2289284125238597e-06,
+ "loss": 0.1032,
+ "step": 337
+ },
+ {
+ "epoch": 2.9649122807017543,
+ "grad_norm": 0.3386179208755493,
+ "learning_rate": 3.216053682915436e-06,
+ "loss": 0.1037,
+ "step": 338
+ },
+ {
+ "epoch": 2.973684210526316,
+ "grad_norm": 0.7565059065818787,
+ "learning_rate": 3.203158231931234e-06,
+ "loss": 0.1048,
+ "step": 339
+ },
+ {
+ "epoch": 2.982456140350877,
+ "grad_norm": 0.7902039289474487,
+ "learning_rate": 3.190242432743673e-06,
+ "loss": 0.1068,
+ "step": 340
+ },
+ {
+ "epoch": 2.9912280701754383,
+ "grad_norm": 0.42595192790031433,
+ "learning_rate": 3.177306659114015e-06,
+ "loss": 0.1039,
+ "step": 341
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.1214542388916016,
+ "learning_rate": 3.164351285381549e-06,
+ "loss": 0.1062,
+ "step": 342
+ },
+ {
+ "epoch": 3.008771929824561,
+ "grad_norm": 0.7622955441474915,
+ "learning_rate": 3.1513766864527577e-06,
+ "loss": 0.1015,
+ "step": 343
+ },
+ {
+ "epoch": 3.017543859649123,
+ "grad_norm": 0.2676297724246979,
+ "learning_rate": 3.1383832377904676e-06,
+ "loss": 0.1037,
+ "step": 344
+ },
+ {
+ "epoch": 3.026315789473684,
+ "grad_norm": 0.8695605397224426,
+ "learning_rate": 3.1253713154029857e-06,
+ "loss": 0.1056,
+ "step": 345
+ },
+ {
+ "epoch": 3.0350877192982457,
+ "grad_norm": 0.5875906944274902,
+ "learning_rate": 3.1123412958332155e-06,
+ "loss": 0.1067,
+ "step": 346
+ },
+ {
+ "epoch": 3.043859649122807,
+ "grad_norm": 0.7699372172355652,
+ "learning_rate": 3.0992935561477632e-06,
+ "loss": 0.1035,
+ "step": 347
+ },
+ {
+ "epoch": 3.0526315789473686,
+ "grad_norm": 0.5919204354286194,
+ "learning_rate": 3.0862284739260247e-06,
+ "loss": 0.1023,
+ "step": 348
+ },
+ {
+ "epoch": 3.06140350877193,
+ "grad_norm": 1.3211849927902222,
+ "learning_rate": 3.07314642724926e-06,
+ "loss": 0.1065,
+ "step": 349
+ },
+ {
+ "epoch": 3.0701754385964914,
+ "grad_norm": 0.6359637379646301,
+ "learning_rate": 3.0600477946896494e-06,
+ "loss": 0.106,
+ "step": 350
+ },
+ {
+ "epoch": 3.0789473684210527,
+ "grad_norm": 0.35776662826538086,
+ "learning_rate": 3.046932955299344e-06,
+ "loss": 0.1046,
+ "step": 351
+ },
+ {
+ "epoch": 3.087719298245614,
+ "grad_norm": 0.6657406687736511,
+ "learning_rate": 3.0338022885994904e-06,
+ "loss": 0.1076,
+ "step": 352
+ },
+ {
+ "epoch": 3.0964912280701755,
+ "grad_norm": 0.7587785720825195,
+ "learning_rate": 3.0206561745692512e-06,
+ "loss": 0.1043,
+ "step": 353
+ },
+ {
+ "epoch": 3.1052631578947367,
+ "grad_norm": 1.1258317232131958,
+ "learning_rate": 3.0074949936348084e-06,
+ "loss": 0.1043,
+ "step": 354
+ },
+ {
+ "epoch": 3.1140350877192984,
+ "grad_norm": 0.3570568263530731,
+ "learning_rate": 2.9943191266583564e-06,
+ "loss": 0.1032,
+ "step": 355
+ },
+ {
+ "epoch": 3.1228070175438596,
+ "grad_norm": 0.843485414981842,
+ "learning_rate": 2.981128954927075e-06,
+ "loss": 0.1045,
+ "step": 356
+ },
+ {
+ "epoch": 3.1315789473684212,
+ "grad_norm": 0.5719651579856873,
+ "learning_rate": 2.967924860142103e-06,
+ "loss": 0.1052,
+ "step": 357
+ },
+ {
+ "epoch": 3.1403508771929824,
+ "grad_norm": 2.20767879486084,
+ "learning_rate": 2.9547072244074853e-06,
+ "loss": 0.1078,
+ "step": 358
+ },
+ {
+ "epoch": 3.1491228070175437,
+ "grad_norm": 0.3715457022190094,
+ "learning_rate": 2.941476430219122e-06,
+ "loss": 0.1047,
+ "step": 359
+ },
+ {
+ "epoch": 3.1578947368421053,
+ "grad_norm": 0.7803200483322144,
+ "learning_rate": 2.928232860453694e-06,
+ "loss": 0.1029,
+ "step": 360
+ },
+ {
+ "epoch": 3.1666666666666665,
+ "grad_norm": 0.5198164582252502,
+ "learning_rate": 2.9149768983575884e-06,
+ "loss": 0.1032,
+ "step": 361
+ },
+ {
+ "epoch": 3.175438596491228,
+ "grad_norm": 0.7827185988426208,
+ "learning_rate": 2.9017089275358017e-06,
+ "loss": 0.1043,
+ "step": 362
+ },
+ {
+ "epoch": 3.1842105263157894,
+ "grad_norm": 0.4000351130962372,
+ "learning_rate": 2.8884293319408464e-06,
+ "loss": 0.1071,
+ "step": 363
+ },
+ {
+ "epoch": 3.192982456140351,
+ "grad_norm": 0.9913386106491089,
+ "learning_rate": 2.8751384958616318e-06,
+ "loss": 0.1022,
+ "step": 364
+ },
+ {
+ "epoch": 3.2017543859649122,
+ "grad_norm": 0.6975695490837097,
+ "learning_rate": 2.861836803912353e-06,
+ "loss": 0.1029,
+ "step": 365
+ },
+ {
+ "epoch": 3.2105263157894735,
+ "grad_norm": 0.2372695654630661,
+ "learning_rate": 2.8485246410213497e-06,
+ "loss": 0.1015,
+ "step": 366
+ },
+ {
+ "epoch": 3.219298245614035,
+ "grad_norm": 0.447732537984848,
+ "learning_rate": 2.835202392419977e-06,
+ "loss": 0.1052,
+ "step": 367
+ },
+ {
+ "epoch": 3.2280701754385963,
+ "grad_norm": 0.6617346405982971,
+ "learning_rate": 2.8218704436314525e-06,
+ "loss": 0.1055,
+ "step": 368
+ },
+ {
+ "epoch": 3.236842105263158,
+ "grad_norm": 0.5550402402877808,
+ "learning_rate": 2.8085291804596995e-06,
+ "loss": 0.102,
+ "step": 369
+ },
+ {
+ "epoch": 3.245614035087719,
+ "grad_norm": 0.6046020984649658,
+ "learning_rate": 2.795178988978185e-06,
+ "loss": 0.1036,
+ "step": 370
+ },
+ {
+ "epoch": 3.254385964912281,
+ "grad_norm": 0.41890618205070496,
+ "learning_rate": 2.781820255518745e-06,
+ "loss": 0.1036,
+ "step": 371
+ },
+ {
+ "epoch": 3.263157894736842,
+ "grad_norm": 0.8387415409088135,
+ "learning_rate": 2.768453366660408e-06,
+ "loss": 0.1076,
+ "step": 372
+ },
+ {
+ "epoch": 3.2719298245614037,
+ "grad_norm": 0.5318773984909058,
+ "learning_rate": 2.755078709218203e-06,
+ "loss": 0.1052,
+ "step": 373
+ },
+ {
+ "epoch": 3.280701754385965,
+ "grad_norm": 0.6617523431777954,
+ "learning_rate": 2.741696670231969e-06,
+ "loss": 0.1049,
+ "step": 374
+ },
+ {
+ "epoch": 3.2894736842105265,
+ "grad_norm": 1.0190025568008423,
+ "learning_rate": 2.728307636955156e-06,
+ "loss": 0.1034,
+ "step": 375
+ },
+ {
+ "epoch": 3.2982456140350878,
+ "grad_norm": 0.6924716234207153,
+ "learning_rate": 2.714911996843617e-06,
+ "loss": 0.1065,
+ "step": 376
+ },
+ {
+ "epoch": 3.307017543859649,
+ "grad_norm": 0.42501118779182434,
+ "learning_rate": 2.701510137544393e-06,
+ "loss": 0.1019,
+ "step": 377
+ },
+ {
+ "epoch": 3.3157894736842106,
+ "grad_norm": 0.844886064529419,
+ "learning_rate": 2.6881024468845e-06,
+ "loss": 0.1047,
+ "step": 378
+ },
+ {
+ "epoch": 3.324561403508772,
+ "grad_norm": 0.46512728929519653,
+ "learning_rate": 2.674689312859704e-06,
+ "loss": 0.1043,
+ "step": 379
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 0.6242017149925232,
+ "learning_rate": 2.6612711236232915e-06,
+ "loss": 0.1046,
+ "step": 380
+ },
+ {
+ "epoch": 3.3421052631578947,
+ "grad_norm": 0.6578526496887207,
+ "learning_rate": 2.6478482674748375e-06,
+ "loss": 0.1031,
+ "step": 381
+ },
+ {
+ "epoch": 3.3508771929824563,
+ "grad_norm": 0.4822542667388916,
+ "learning_rate": 2.63442113284897e-06,
+ "loss": 0.1053,
+ "step": 382
+ },
+ {
+ "epoch": 3.3596491228070176,
+ "grad_norm": 0.48255595564842224,
+ "learning_rate": 2.6209901083041307e-06,
+ "loss": 0.1058,
+ "step": 383
+ },
+ {
+ "epoch": 3.3684210526315788,
+ "grad_norm": 0.6624025702476501,
+ "learning_rate": 2.6075555825113265e-06,
+ "loss": 0.1066,
+ "step": 384
+ },
+ {
+ "epoch": 3.3771929824561404,
+ "grad_norm": 0.6962618827819824,
+ "learning_rate": 2.5941179442428864e-06,
+ "loss": 0.102,
+ "step": 385
+ },
+ {
+ "epoch": 3.3859649122807016,
+ "grad_norm": 0.4976450502872467,
+ "learning_rate": 2.580677582361208e-06,
+ "loss": 0.1011,
+ "step": 386
+ },
+ {
+ "epoch": 3.3947368421052633,
+ "grad_norm": 0.5283737182617188,
+ "learning_rate": 2.5672348858075053e-06,
+ "loss": 0.1057,
+ "step": 387
+ },
+ {
+ "epoch": 3.4035087719298245,
+ "grad_norm": 0.32338738441467285,
+ "learning_rate": 2.553790243590556e-06,
+ "loss": 0.1015,
+ "step": 388
+ },
+ {
+ "epoch": 3.412280701754386,
+ "grad_norm": 0.7909435629844666,
+ "learning_rate": 2.5403440447754385e-06,
+ "loss": 0.1036,
+ "step": 389
+ },
+ {
+ "epoch": 3.4210526315789473,
+ "grad_norm": 0.6297115087509155,
+ "learning_rate": 2.5268966784722792e-06,
+ "loss": 0.1042,
+ "step": 390
+ },
+ {
+ "epoch": 3.4298245614035086,
+ "grad_norm": 0.32988762855529785,
+ "learning_rate": 2.513448533824988e-06,
+ "loss": 0.1059,
+ "step": 391
+ },
+ {
+ "epoch": 3.43859649122807,
+ "grad_norm": 0.9211220145225525,
+ "learning_rate": 2.5e-06,
+ "loss": 0.1015,
+ "step": 392
+ },
+ {
+ "epoch": 3.4473684210526314,
+ "grad_norm": 1.2157588005065918,
+ "learning_rate": 2.486551466175013e-06,
+ "loss": 0.1035,
+ "step": 393
+ },
+ {
+ "epoch": 3.456140350877193,
+ "grad_norm": 0.4786648452281952,
+ "learning_rate": 2.4731033215277216e-06,
+ "loss": 0.1026,
+ "step": 394
+ },
+ {
+ "epoch": 3.4649122807017543,
+ "grad_norm": 0.37398242950439453,
+ "learning_rate": 2.4596559552245623e-06,
+ "loss": 0.1044,
+ "step": 395
+ },
+ {
+ "epoch": 3.473684210526316,
+ "grad_norm": 0.5536217093467712,
+ "learning_rate": 2.446209756409445e-06,
+ "loss": 0.1043,
+ "step": 396
+ },
+ {
+ "epoch": 3.482456140350877,
+ "grad_norm": 0.708406925201416,
+ "learning_rate": 2.432765114192495e-06,
+ "loss": 0.1046,
+ "step": 397
+ },
+ {
+ "epoch": 3.4912280701754383,
+ "grad_norm": 0.7140893340110779,
+ "learning_rate": 2.4193224176387926e-06,
+ "loss": 0.1039,
+ "step": 398
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 0.8078088760375977,
+ "learning_rate": 2.4058820557571144e-06,
+ "loss": 0.1013,
+ "step": 399
+ },
+ {
+ "epoch": 3.5087719298245617,
+ "grad_norm": 0.7129591107368469,
+ "learning_rate": 2.3924444174886735e-06,
+ "loss": 0.1057,
+ "step": 400
+ },
+ {
+ "epoch": 3.517543859649123,
+ "grad_norm": 1.293412446975708,
+ "learning_rate": 2.37900989169587e-06,
+ "loss": 0.1081,
+ "step": 401
+ },
+ {
+ "epoch": 3.526315789473684,
+ "grad_norm": 0.7235314249992371,
+ "learning_rate": 2.3655788671510314e-06,
+ "loss": 0.1054,
+ "step": 402
+ },
+ {
+ "epoch": 3.5350877192982457,
+ "grad_norm": 0.6008841395378113,
+ "learning_rate": 2.3521517325251637e-06,
+ "loss": 0.1033,
+ "step": 403
+ },
+ {
+ "epoch": 3.543859649122807,
+ "grad_norm": 0.6819609999656677,
+ "learning_rate": 2.3387288763767097e-06,
+ "loss": 0.1019,
+ "step": 404
+ },
+ {
+ "epoch": 3.5526315789473686,
+ "grad_norm": 0.5696406960487366,
+ "learning_rate": 2.325310687140296e-06,
+ "loss": 0.1043,
+ "step": 405
+ },
+ {
+ "epoch": 3.56140350877193,
+ "grad_norm": 0.8597077131271362,
+ "learning_rate": 2.3118975531155003e-06,
+ "loss": 0.1037,
+ "step": 406
+ },
+ {
+ "epoch": 3.5701754385964914,
+ "grad_norm": 0.43985217809677124,
+ "learning_rate": 2.2984898624556075e-06,
+ "loss": 0.105,
+ "step": 407
+ },
+ {
+ "epoch": 3.5789473684210527,
+ "grad_norm": 0.5448469519615173,
+ "learning_rate": 2.2850880031563845e-06,
+ "loss": 0.1037,
+ "step": 408
+ },
+ {
+ "epoch": 3.587719298245614,
+ "grad_norm": 0.8221977949142456,
+ "learning_rate": 2.271692363044845e-06,
+ "loss": 0.1015,
+ "step": 409
+ },
+ {
+ "epoch": 3.5964912280701755,
+ "grad_norm": 0.9838594198226929,
+ "learning_rate": 2.2583033297680316e-06,
+ "loss": 0.1085,
+ "step": 410
+ },
+ {
+ "epoch": 3.6052631578947367,
+ "grad_norm": 1.034848928451538,
+ "learning_rate": 2.2449212907817985e-06,
+ "loss": 0.104,
+ "step": 411
+ },
+ {
+ "epoch": 3.6140350877192984,
+ "grad_norm": 1.0788371562957764,
+ "learning_rate": 2.2315466333395927e-06,
+ "loss": 0.1033,
+ "step": 412
+ },
+ {
+ "epoch": 3.6228070175438596,
+ "grad_norm": 0.49096915125846863,
+ "learning_rate": 2.2181797444812557e-06,
+ "loss": 0.1044,
+ "step": 413
+ },
+ {
+ "epoch": 3.6315789473684212,
+ "grad_norm": 1.309685230255127,
+ "learning_rate": 2.204821011021815e-06,
+ "loss": 0.1036,
+ "step": 414
+ },
+ {
+ "epoch": 3.6403508771929824,
+ "grad_norm": 0.5014146566390991,
+ "learning_rate": 2.191470819540301e-06,
+ "loss": 0.104,
+ "step": 415
+ },
+ {
+ "epoch": 3.6491228070175437,
+ "grad_norm": 0.770470380783081,
+ "learning_rate": 2.178129556368548e-06,
+ "loss": 0.1049,
+ "step": 416
+ },
+ {
+ "epoch": 3.6578947368421053,
+ "grad_norm": 0.4639376699924469,
+ "learning_rate": 2.1647976075800235e-06,
+ "loss": 0.1047,
+ "step": 417
+ },
+ {
+ "epoch": 3.6666666666666665,
+ "grad_norm": 1.101885437965393,
+ "learning_rate": 2.151475358978652e-06,
+ "loss": 0.1035,
+ "step": 418
+ },
+ {
+ "epoch": 3.675438596491228,
+ "grad_norm": 0.5644329786300659,
+ "learning_rate": 2.138163196087648e-06,
+ "loss": 0.103,
+ "step": 419
+ },
+ {
+ "epoch": 3.6842105263157894,
+ "grad_norm": 1.1015008687973022,
+ "learning_rate": 2.1248615041383686e-06,
+ "loss": 0.1054,
+ "step": 420
+ },
+ {
+ "epoch": 3.692982456140351,
+ "grad_norm": 0.7311366200447083,
+ "learning_rate": 2.111570668059155e-06,
+ "loss": 0.1043,
+ "step": 421
+ },
+ {
+ "epoch": 3.7017543859649122,
+ "grad_norm": 0.38242173194885254,
+ "learning_rate": 2.098291072464199e-06,
+ "loss": 0.1041,
+ "step": 422
+ },
+ {
+ "epoch": 3.7105263157894735,
+ "grad_norm": 1.231512188911438,
+ "learning_rate": 2.085023101642412e-06,
+ "loss": 0.1021,
+ "step": 423
+ },
+ {
+ "epoch": 3.719298245614035,
+ "grad_norm": 0.41761213541030884,
+ "learning_rate": 2.0717671395463063e-06,
+ "loss": 0.1062,
+ "step": 424
+ },
+ {
+ "epoch": 3.7280701754385968,
+ "grad_norm": 0.4593309462070465,
+ "learning_rate": 2.0585235697808794e-06,
+ "loss": 0.1012,
+ "step": 425
+ },
+ {
+ "epoch": 3.736842105263158,
+ "grad_norm": 0.9147135019302368,
+ "learning_rate": 2.0452927755925155e-06,
+ "loss": 0.1046,
+ "step": 426
+ },
+ {
+ "epoch": 3.745614035087719,
+ "grad_norm": 0.39639535546302795,
+ "learning_rate": 2.0320751398578984e-06,
+ "loss": 0.1018,
+ "step": 427
+ },
+ {
+ "epoch": 3.754385964912281,
+ "grad_norm": 0.688010573387146,
+ "learning_rate": 2.0188710450729255e-06,
+ "loss": 0.104,
+ "step": 428
+ },
+ {
+ "epoch": 3.763157894736842,
+ "grad_norm": 0.5140353441238403,
+ "learning_rate": 2.005680873341644e-06,
+ "loss": 0.1033,
+ "step": 429
+ },
+ {
+ "epoch": 3.7719298245614032,
+ "grad_norm": 0.5970481634140015,
+ "learning_rate": 1.992505006365191e-06,
+ "loss": 0.1044,
+ "step": 430
+ },
+ {
+ "epoch": 3.780701754385965,
+ "grad_norm": 0.551162838935852,
+ "learning_rate": 1.9793438254307496e-06,
+ "loss": 0.1042,
+ "step": 431
+ },
+ {
+ "epoch": 3.7894736842105265,
+ "grad_norm": 0.5344637632369995,
+ "learning_rate": 1.96619771140051e-06,
+ "loss": 0.1042,
+ "step": 432
+ },
+ {
+ "epoch": 3.7982456140350878,
+ "grad_norm": 0.5357667207717896,
+ "learning_rate": 1.9530670447006566e-06,
+ "loss": 0.101,
+ "step": 433
+ },
+ {
+ "epoch": 3.807017543859649,
+ "grad_norm": 1.2536660432815552,
+ "learning_rate": 1.9399522053103514e-06,
+ "loss": 0.1008,
+ "step": 434
+ },
+ {
+ "epoch": 3.8157894736842106,
+ "grad_norm": 0.4888289272785187,
+ "learning_rate": 1.926853572750741e-06,
+ "loss": 0.1028,
+ "step": 435
+ },
+ {
+ "epoch": 3.824561403508772,
+ "grad_norm": 0.5810404419898987,
+ "learning_rate": 1.913771526073976e-06,
+ "loss": 0.1031,
+ "step": 436
+ },
+ {
+ "epoch": 3.8333333333333335,
+ "grad_norm": 0.5372979044914246,
+ "learning_rate": 1.9007064438522374e-06,
+ "loss": 0.107,
+ "step": 437
+ },
+ {
+ "epoch": 3.8421052631578947,
+ "grad_norm": 0.8293616771697998,
+ "learning_rate": 1.8876587041667855e-06,
+ "loss": 0.1033,
+ "step": 438
+ },
+ {
+ "epoch": 3.8508771929824563,
+ "grad_norm": 2.361504554748535,
+ "learning_rate": 1.8746286845970145e-06,
+ "loss": 0.1098,
+ "step": 439
+ },
+ {
+ "epoch": 3.8596491228070176,
+ "grad_norm": 0.70230633020401,
+ "learning_rate": 1.8616167622095328e-06,
+ "loss": 0.1034,
+ "step": 440
+ },
+ {
+ "epoch": 3.8684210526315788,
+ "grad_norm": 0.6323564052581787,
+ "learning_rate": 1.8486233135472436e-06,
+ "loss": 0.1058,
+ "step": 441
+ },
+ {
+ "epoch": 3.8771929824561404,
+ "grad_norm": 0.48205408453941345,
+ "learning_rate": 1.8356487146184517e-06,
+ "loss": 0.105,
+ "step": 442
+ },
+ {
+ "epoch": 3.8859649122807016,
+ "grad_norm": 0.6996872425079346,
+ "learning_rate": 1.8226933408859864e-06,
+ "loss": 0.1083,
+ "step": 443
+ },
+ {
+ "epoch": 3.8947368421052633,
+ "grad_norm": 0.4114651679992676,
+ "learning_rate": 1.8097575672563278e-06,
+ "loss": 0.1003,
+ "step": 444
+ },
+ {
+ "epoch": 3.9035087719298245,
+ "grad_norm": 0.5234648585319519,
+ "learning_rate": 1.7968417680687666e-06,
+ "loss": 0.1019,
+ "step": 445
+ },
+ {
+ "epoch": 3.912280701754386,
+ "grad_norm": 1.0571491718292236,
+ "learning_rate": 1.7839463170845641e-06,
+ "loss": 0.1003,
+ "step": 446
+ },
+ {
+ "epoch": 3.9210526315789473,
+ "grad_norm": 0.7470094561576843,
+ "learning_rate": 1.7710715874761408e-06,
+ "loss": 0.1061,
+ "step": 447
+ },
+ {
+ "epoch": 3.9298245614035086,
+ "grad_norm": 0.901695191860199,
+ "learning_rate": 1.7582179518162742e-06,
+ "loss": 0.1015,
+ "step": 448
+ },
+ {
+ "epoch": 3.93859649122807,
+ "grad_norm": 1.0251179933547974,
+ "learning_rate": 1.7453857820673215e-06,
+ "loss": 0.1,
+ "step": 449
+ },
+ {
+ "epoch": 3.9473684210526314,
+ "grad_norm": 0.5065406560897827,
+ "learning_rate": 1.7325754495704508e-06,
+ "loss": 0.1036,
+ "step": 450
+ },
+ {
+ "epoch": 3.956140350877193,
+ "grad_norm": 0.9541155099868774,
+ "learning_rate": 1.7197873250348962e-06,
+ "loss": 0.1015,
+ "step": 451
+ },
+ {
+ "epoch": 3.9649122807017543,
+ "grad_norm": 0.6264199018478394,
+ "learning_rate": 1.7070217785272354e-06,
+ "loss": 0.1026,
+ "step": 452
+ },
+ {
+ "epoch": 3.973684210526316,
+ "grad_norm": 0.6260526180267334,
+ "learning_rate": 1.6942791794606716e-06,
+ "loss": 0.1039,
+ "step": 453
+ },
+ {
+ "epoch": 3.982456140350877,
+ "grad_norm": 0.4730931222438812,
+ "learning_rate": 1.681559896584352e-06,
+ "loss": 0.1045,
+ "step": 454
+ },
+ {
+ "epoch": 3.9912280701754383,
+ "grad_norm": 0.5011451840400696,
+ "learning_rate": 1.668864297972689e-06,
+ "loss": 0.1062,
+ "step": 455
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 1.0113046169281006,
+ "learning_rate": 1.6561927510147172e-06,
+ "loss": 0.1005,
+ "step": 456
+ },
+ {
+ "epoch": 4.008771929824562,
+ "grad_norm": 0.6017364263534546,
+ "learning_rate": 1.6435456224034536e-06,
+ "loss": 0.1042,
+ "step": 457
+ },
+ {
+ "epoch": 4.017543859649122,
+ "grad_norm": 0.6874931454658508,
+ "learning_rate": 1.63092327812529e-06,
+ "loss": 0.102,
+ "step": 458
+ },
+ {
+ "epoch": 4.026315789473684,
+ "grad_norm": 1.311024785041809,
+ "learning_rate": 1.6183260834494053e-06,
+ "loss": 0.1063,
+ "step": 459
+ },
+ {
+ "epoch": 4.035087719298246,
+ "grad_norm": 0.3640352785587311,
+ "learning_rate": 1.6057544029171863e-06,
+ "loss": 0.1039,
+ "step": 460
+ },
+ {
+ "epoch": 4.043859649122807,
+ "grad_norm": 0.6056526303291321,
+ "learning_rate": 1.5932086003316893e-06,
+ "loss": 0.099,
+ "step": 461
+ },
+ {
+ "epoch": 4.052631578947368,
+ "grad_norm": 0.5407683849334717,
+ "learning_rate": 1.5806890387471025e-06,
+ "loss": 0.1038,
+ "step": 462
+ },
+ {
+ "epoch": 4.06140350877193,
+ "grad_norm": 0.7054030895233154,
+ "learning_rate": 1.5681960804582474e-06,
+ "loss": 0.1001,
+ "step": 463
+ },
+ {
+ "epoch": 4.0701754385964914,
+ "grad_norm": 0.8736140727996826,
+ "learning_rate": 1.5557300869900876e-06,
+ "loss": 0.1035,
+ "step": 464
+ },
+ {
+ "epoch": 4.078947368421052,
+ "grad_norm": 0.6689419746398926,
+ "learning_rate": 1.5432914190872757e-06,
+ "loss": 0.1052,
+ "step": 465
+ },
+ {
+ "epoch": 4.087719298245614,
+ "grad_norm": 0.8937819600105286,
+ "learning_rate": 1.530880436703705e-06,
+ "loss": 0.1024,
+ "step": 466
+ },
+ {
+ "epoch": 4.0964912280701755,
+ "grad_norm": 0.24332484602928162,
+ "learning_rate": 1.518497498992097e-06,
+ "loss": 0.0984,
+ "step": 467
+ },
+ {
+ "epoch": 4.105263157894737,
+ "grad_norm": 0.9716914296150208,
+ "learning_rate": 1.5061429642936107e-06,
+ "loss": 0.1012,
+ "step": 468
+ },
+ {
+ "epoch": 4.114035087719298,
+ "grad_norm": 0.5864392518997192,
+ "learning_rate": 1.4938171901274678e-06,
+ "loss": 0.1029,
+ "step": 469
+ },
+ {
+ "epoch": 4.12280701754386,
+ "grad_norm": 0.4616212546825409,
+ "learning_rate": 1.4815205331806113e-06,
+ "loss": 0.1035,
+ "step": 470
+ },
+ {
+ "epoch": 4.131578947368421,
+ "grad_norm": 0.5989730954170227,
+ "learning_rate": 1.4692533492973775e-06,
+ "loss": 0.1036,
+ "step": 471
+ },
+ {
+ "epoch": 4.140350877192983,
+ "grad_norm": 0.7900629639625549,
+ "learning_rate": 1.4570159934692085e-06,
+ "loss": 0.1044,
+ "step": 472
+ },
+ {
+ "epoch": 4.149122807017544,
+ "grad_norm": 0.5659995675086975,
+ "learning_rate": 1.4448088198243668e-06,
+ "loss": 0.1024,
+ "step": 473
+ },
+ {
+ "epoch": 4.157894736842105,
+ "grad_norm": 0.7867873311042786,
+ "learning_rate": 1.432632181617698e-06,
+ "loss": 0.1038,
+ "step": 474
+ },
+ {
+ "epoch": 4.166666666666667,
+ "grad_norm": 0.44385358691215515,
+ "learning_rate": 1.4204864312204033e-06,
+ "loss": 0.1006,
+ "step": 475
+ },
+ {
+ "epoch": 4.175438596491228,
+ "grad_norm": 0.3909265697002411,
+ "learning_rate": 1.4083719201098404e-06,
+ "loss": 0.1019,
+ "step": 476
+ },
+ {
+ "epoch": 4.184210526315789,
+ "grad_norm": 0.7079223990440369,
+ "learning_rate": 1.3962889988593609e-06,
+ "loss": 0.1019,
+ "step": 477
+ },
+ {
+ "epoch": 4.192982456140351,
+ "grad_norm": 0.6703695058822632,
+ "learning_rate": 1.3842380171281522e-06,
+ "loss": 0.1063,
+ "step": 478
+ },
+ {
+ "epoch": 4.201754385964913,
+ "grad_norm": 0.3477051556110382,
+ "learning_rate": 1.3722193236511344e-06,
+ "loss": 0.1004,
+ "step": 479
+ },
+ {
+ "epoch": 4.2105263157894735,
+ "grad_norm": 0.7296048402786255,
+ "learning_rate": 1.3602332662288536e-06,
+ "loss": 0.1057,
+ "step": 480
+ },
+ {
+ "epoch": 4.219298245614035,
+ "grad_norm": 0.7007803916931152,
+ "learning_rate": 1.348280191717427e-06,
+ "loss": 0.1007,
+ "step": 481
+ },
+ {
+ "epoch": 4.228070175438597,
+ "grad_norm": 0.948968231678009,
+ "learning_rate": 1.3363604460185031e-06,
+ "loss": 0.1005,
+ "step": 482
+ },
+ {
+ "epoch": 4.2368421052631575,
+ "grad_norm": 0.6567812561988831,
+ "learning_rate": 1.3244743740692496e-06,
+ "loss": 0.1016,
+ "step": 483
+ },
+ {
+ "epoch": 4.245614035087719,
+ "grad_norm": 0.5390146374702454,
+ "learning_rate": 1.3126223198323752e-06,
+ "loss": 0.1025,
+ "step": 484
+ },
+ {
+ "epoch": 4.254385964912281,
+ "grad_norm": 0.43638724088668823,
+ "learning_rate": 1.3008046262861696e-06,
+ "loss": 0.1053,
+ "step": 485
+ },
+ {
+ "epoch": 4.2631578947368425,
+ "grad_norm": 0.43589839339256287,
+ "learning_rate": 1.289021635414589e-06,
+ "loss": 0.1036,
+ "step": 486
+ },
+ {
+ "epoch": 4.271929824561403,
+ "grad_norm": 0.3999694585800171,
+ "learning_rate": 1.277273688197346e-06,
+ "loss": 0.1023,
+ "step": 487
+ },
+ {
+ "epoch": 4.280701754385965,
+ "grad_norm": 0.6314297914505005,
+ "learning_rate": 1.265561124600057e-06,
+ "loss": 0.0993,
+ "step": 488
+ },
+ {
+ "epoch": 4.2894736842105265,
+ "grad_norm": 0.566033124923706,
+ "learning_rate": 1.2538842835643906e-06,
+ "loss": 0.1029,
+ "step": 489
+ },
+ {
+ "epoch": 4.298245614035087,
+ "grad_norm": 0.6713336110115051,
+ "learning_rate": 1.2422435029982669e-06,
+ "loss": 0.1002,
+ "step": 490
+ },
+ {
+ "epoch": 4.307017543859649,
+ "grad_norm": 0.428574800491333,
+ "learning_rate": 1.2306391197660797e-06,
+ "loss": 0.1028,
+ "step": 491
+ },
+ {
+ "epoch": 4.315789473684211,
+ "grad_norm": 0.637745201587677,
+ "learning_rate": 1.219071469678941e-06,
+ "loss": 0.1009,
+ "step": 492
+ },
+ {
+ "epoch": 4.324561403508772,
+ "grad_norm": 0.8204445242881775,
+ "learning_rate": 1.2075408874849747e-06,
+ "loss": 0.099,
+ "step": 493
+ },
+ {
+ "epoch": 4.333333333333333,
+ "grad_norm": 1.010758876800537,
+ "learning_rate": 1.1960477068596155e-06,
+ "loss": 0.1006,
+ "step": 494
+ },
+ {
+ "epoch": 4.342105263157895,
+ "grad_norm": 0.908112108707428,
+ "learning_rate": 1.1845922603959677e-06,
+ "loss": 0.1047,
+ "step": 495
+ },
+ {
+ "epoch": 4.350877192982456,
+ "grad_norm": 1.0254642963409424,
+ "learning_rate": 1.173174879595166e-06,
+ "loss": 0.0991,
+ "step": 496
+ },
+ {
+ "epoch": 4.359649122807017,
+ "grad_norm": 0.5159414410591125,
+ "learning_rate": 1.1617958948567967e-06,
+ "loss": 0.0978,
+ "step": 497
+ },
+ {
+ "epoch": 4.368421052631579,
+ "grad_norm": 0.9525816440582275,
+ "learning_rate": 1.1504556354693227e-06,
+ "loss": 0.1051,
+ "step": 498
+ },
+ {
+ "epoch": 4.37719298245614,
+ "grad_norm": 0.9321548938751221,
+ "learning_rate": 1.1391544296005652e-06,
+ "loss": 0.1011,
+ "step": 499
+ },
+ {
+ "epoch": 4.385964912280702,
+ "grad_norm": 0.7308889627456665,
+ "learning_rate": 1.1278926042882026e-06,
+ "loss": 0.1002,
+ "step": 500
+ },
+ {
+ "epoch": 4.394736842105263,
+ "grad_norm": 0.9508903622627258,
+ "learning_rate": 1.116670485430304e-06,
+ "loss": 0.1013,
+ "step": 501
+ },
+ {
+ "epoch": 4.4035087719298245,
+ "grad_norm": 0.5174031853675842,
+ "learning_rate": 1.1054883977759067e-06,
+ "loss": 0.104,
+ "step": 502
+ },
+ {
+ "epoch": 4.412280701754386,
+ "grad_norm": 0.4504610598087311,
+ "learning_rate": 1.0943466649156061e-06,
+ "loss": 0.1013,
+ "step": 503
+ },
+ {
+ "epoch": 4.421052631578947,
+ "grad_norm": 0.5650261044502258,
+ "learning_rate": 1.0832456092722063e-06,
+ "loss": 0.0995,
+ "step": 504
+ },
+ {
+ "epoch": 4.4298245614035086,
+ "grad_norm": 0.37759432196617126,
+ "learning_rate": 1.0721855520913751e-06,
+ "loss": 0.1058,
+ "step": 505
+ },
+ {
+ "epoch": 4.43859649122807,
+ "grad_norm": 0.7238495349884033,
+ "learning_rate": 1.0611668134323577e-06,
+ "loss": 0.1012,
+ "step": 506
+ },
+ {
+ "epoch": 4.447368421052632,
+ "grad_norm": 0.6301494240760803,
+ "learning_rate": 1.0501897121587127e-06,
+ "loss": 0.1009,
+ "step": 507
+ },
+ {
+ "epoch": 4.456140350877193,
+ "grad_norm": 0.9531002044677734,
+ "learning_rate": 1.0392545659290789e-06,
+ "loss": 0.1021,
+ "step": 508
+ },
+ {
+ "epoch": 4.464912280701754,
+ "grad_norm": 0.4423767924308777,
+ "learning_rate": 1.0283616911879943e-06,
+ "loss": 0.1024,
+ "step": 509
+ },
+ {
+ "epoch": 4.473684210526316,
+ "grad_norm": 0.5573019981384277,
+ "learning_rate": 1.0175114031567246e-06,
+ "loss": 0.1011,
+ "step": 510
+ },
+ {
+ "epoch": 4.482456140350878,
+ "grad_norm": 0.9792631268501282,
+ "learning_rate": 1.0067040158241555e-06,
+ "loss": 0.1039,
+ "step": 511
+ },
+ {
+ "epoch": 4.491228070175438,
+ "grad_norm": 1.7911303043365479,
+ "learning_rate": 9.95939841937693e-07,
+ "loss": 0.104,
+ "step": 512
+ },
+ {
+ "epoch": 4.5,
+ "grad_norm": 0.5825617909431458,
+ "learning_rate": 9.852191929942262e-07,
+ "loss": 0.0987,
+ "step": 513
+ },
+ {
+ "epoch": 4.508771929824562,
+ "grad_norm": 0.3129921555519104,
+ "learning_rate": 9.745423792310996e-07,
+ "loss": 0.0979,
+ "step": 514
+ },
+ {
+ "epoch": 4.517543859649123,
+ "grad_norm": 0.5376678705215454,
+ "learning_rate": 9.63909709617146e-07,
+ "loss": 0.0998,
+ "step": 515
+ },
+ {
+ "epoch": 4.526315789473684,
+ "grad_norm": 0.48920008540153503,
+ "learning_rate": 9.533214918437422e-07,
+ "loss": 0.1017,
+ "step": 516
+ },
+ {
+ "epoch": 4.535087719298246,
+ "grad_norm": 0.36829131841659546,
+ "learning_rate": 9.427780323159006e-07,
+ "loss": 0.1004,
+ "step": 517
+ },
+ {
+ "epoch": 4.543859649122807,
+ "grad_norm": 0.5459544658660889,
+ "learning_rate": 9.322796361434111e-07,
+ "loss": 0.1041,
+ "step": 518
+ },
+ {
+ "epoch": 4.552631578947368,
+ "grad_norm": 0.8460657000541687,
+ "learning_rate": 9.218266071320015e-07,
+ "loss": 0.1012,
+ "step": 519
+ },
+ {
+ "epoch": 4.56140350877193,
+ "grad_norm": 0.7692683339118958,
+ "learning_rate": 9.114192477745568e-07,
+ "loss": 0.1013,
+ "step": 520
+ },
+ {
+ "epoch": 4.5701754385964914,
+ "grad_norm": 0.4503592550754547,
+ "learning_rate": 9.010578592423544e-07,
+ "loss": 0.107,
+ "step": 521
+ },
+ {
+ "epoch": 4.578947368421053,
+ "grad_norm": 0.9348855018615723,
+ "learning_rate": 8.907427413763572e-07,
+ "loss": 0.102,
+ "step": 522
+ },
+ {
+ "epoch": 4.587719298245614,
+ "grad_norm": 0.7902988791465759,
+ "learning_rate": 8.804741926785335e-07,
+ "loss": 0.1032,
+ "step": 523
+ },
+ {
+ "epoch": 4.5964912280701755,
+ "grad_norm": 0.5444673299789429,
+ "learning_rate": 8.702525103032186e-07,
+ "loss": 0.0993,
+ "step": 524
+ },
+ {
+ "epoch": 4.605263157894737,
+ "grad_norm": 0.728112518787384,
+ "learning_rate": 8.60077990048517e-07,
+ "loss": 0.1021,
+ "step": 525
+ },
+ {
+ "epoch": 4.614035087719298,
+ "grad_norm": 0.5250695943832397,
+ "learning_rate": 8.499509263477388e-07,
+ "loss": 0.1018,
+ "step": 526
+ },
+ {
+ "epoch": 4.62280701754386,
+ "grad_norm": 0.3112829625606537,
+ "learning_rate": 8.398716122608868e-07,
+ "loss": 0.1037,
+ "step": 527
+ },
+ {
+ "epoch": 4.631578947368421,
+ "grad_norm": 0.9097342491149902,
+ "learning_rate": 8.298403394661658e-07,
+ "loss": 0.1015,
+ "step": 528
+ },
+ {
+ "epoch": 4.640350877192983,
+ "grad_norm": 0.6663810014724731,
+ "learning_rate": 8.198573982515537e-07,
+ "loss": 0.1038,
+ "step": 529
+ },
+ {
+ "epoch": 4.649122807017544,
+ "grad_norm": 1.1880309581756592,
+ "learning_rate": 8.099230775063879e-07,
+ "loss": 0.1044,
+ "step": 530
+ },
+ {
+ "epoch": 4.657894736842105,
+ "grad_norm": 0.6492993831634521,
+ "learning_rate": 8.000376647130165e-07,
+ "loss": 0.103,
+ "step": 531
+ },
+ {
+ "epoch": 4.666666666666667,
+ "grad_norm": 0.43723204731941223,
+ "learning_rate": 7.902014459384744e-07,
+ "loss": 0.1025,
+ "step": 532
+ },
+ {
+ "epoch": 4.675438596491228,
+ "grad_norm": 0.8422684669494629,
+ "learning_rate": 7.804147058262015e-07,
+ "loss": 0.1035,
+ "step": 533
+ },
+ {
+ "epoch": 4.684210526315789,
+ "grad_norm": 0.6502094268798828,
+ "learning_rate": 7.706777275878161e-07,
+ "loss": 0.0994,
+ "step": 534
+ },
+ {
+ "epoch": 4.692982456140351,
+ "grad_norm": 0.5709391236305237,
+ "learning_rate": 7.609907929949045e-07,
+ "loss": 0.1056,
+ "step": 535
+ },
+ {
+ "epoch": 4.701754385964913,
+ "grad_norm": 0.4126770496368408,
+ "learning_rate": 7.513541823708828e-07,
+ "loss": 0.101,
+ "step": 536
+ },
+ {
+ "epoch": 4.7105263157894735,
+ "grad_norm": 0.5016621947288513,
+ "learning_rate": 7.417681745828706e-07,
+ "loss": 0.0999,
+ "step": 537
+ },
+ {
+ "epoch": 4.719298245614035,
+ "grad_norm": 0.8139487504959106,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 0.0984,
+ "step": 538
+ },
+ {
+ "epoch": 4.728070175438597,
+ "grad_norm": 0.5805723667144775,
+ "learning_rate": 7.227490756535396e-07,
+ "loss": 0.1011,
+ "step": 539
+ },
+ {
+ "epoch": 4.7368421052631575,
+ "grad_norm": 0.7970795631408691,
+ "learning_rate": 7.133165348925978e-07,
+ "loss": 0.1016,
+ "step": 540
+ },
+ {
+ "epoch": 4.745614035087719,
+ "grad_norm": 0.6336880326271057,
+ "learning_rate": 7.039356977124937e-07,
+ "loss": 0.1027,
+ "step": 541
+ },
+ {
+ "epoch": 4.754385964912281,
+ "grad_norm": 0.2953254282474518,
+ "learning_rate": 6.946068355786992e-07,
+ "loss": 0.1022,
+ "step": 542
+ },
+ {
+ "epoch": 4.7631578947368425,
+ "grad_norm": 0.5646472573280334,
+ "learning_rate": 6.853302184526217e-07,
+ "loss": 0.0998,
+ "step": 543
+ },
+ {
+ "epoch": 4.771929824561403,
+ "grad_norm": 0.6545483469963074,
+ "learning_rate": 6.761061147837808e-07,
+ "loss": 0.0985,
+ "step": 544
+ },
+ {
+ "epoch": 4.780701754385965,
+ "grad_norm": 0.8741705417633057,
+ "learning_rate": 6.669347915020524e-07,
+ "loss": 0.1006,
+ "step": 545
+ },
+ {
+ "epoch": 4.7894736842105265,
+ "grad_norm": 0.8579487204551697,
+ "learning_rate": 6.578165140099318e-07,
+ "loss": 0.1037,
+ "step": 546
+ },
+ {
+ "epoch": 4.798245614035087,
+ "grad_norm": 1.0744833946228027,
+ "learning_rate": 6.487515461748631e-07,
+ "loss": 0.1017,
+ "step": 547
+ },
+ {
+ "epoch": 4.807017543859649,
+ "grad_norm": 0.4954414367675781,
+ "learning_rate": 6.397401503215992e-07,
+ "loss": 0.1006,
+ "step": 548
+ },
+ {
+ "epoch": 4.815789473684211,
+ "grad_norm": 0.525191068649292,
+ "learning_rate": 6.307825872246076e-07,
+ "loss": 0.1024,
+ "step": 549
+ },
+ {
+ "epoch": 4.824561403508772,
+ "grad_norm": 0.8922368288040161,
+ "learning_rate": 6.218791161005336e-07,
+ "loss": 0.0999,
+ "step": 550
+ },
+ {
+ "epoch": 4.833333333333333,
+ "grad_norm": 0.6471604704856873,
+ "learning_rate": 6.13029994600686e-07,
+ "loss": 0.0994,
+ "step": 551
+ },
+ {
+ "epoch": 4.842105263157895,
+ "grad_norm": 0.49826696515083313,
+ "learning_rate": 6.042354788035943e-07,
+ "loss": 0.1003,
+ "step": 552
+ },
+ {
+ "epoch": 4.850877192982456,
+ "grad_norm": 0.7908043265342712,
+ "learning_rate": 5.954958232075858e-07,
+ "loss": 0.1003,
+ "step": 553
+ },
+ {
+ "epoch": 4.859649122807017,
+ "grad_norm": 0.40011560916900635,
+ "learning_rate": 5.868112807234313e-07,
+ "loss": 0.0991,
+ "step": 554
+ },
+ {
+ "epoch": 4.868421052631579,
+ "grad_norm": 0.9797350764274597,
+ "learning_rate": 5.781821026670203e-07,
+ "loss": 0.1005,
+ "step": 555
+ },
+ {
+ "epoch": 4.87719298245614,
+ "grad_norm": 0.4581677317619324,
+ "learning_rate": 5.696085387520894e-07,
+ "loss": 0.1013,
+ "step": 556
+ },
+ {
+ "epoch": 4.885964912280702,
+ "grad_norm": 0.6596454381942749,
+ "learning_rate": 5.610908370829981e-07,
+ "loss": 0.1028,
+ "step": 557
+ },
+ {
+ "epoch": 4.894736842105263,
+ "grad_norm": 0.5106292963027954,
+ "learning_rate": 5.526292441475448e-07,
+ "loss": 0.1023,
+ "step": 558
+ },
+ {
+ "epoch": 4.9035087719298245,
+ "grad_norm": 0.5137461423873901,
+ "learning_rate": 5.442240048098402e-07,
+ "loss": 0.1036,
+ "step": 559
+ },
+ {
+ "epoch": 4.912280701754386,
+ "grad_norm": 0.4619182348251343,
+ "learning_rate": 5.358753623032137e-07,
+ "loss": 0.0979,
+ "step": 560
+ },
+ {
+ "epoch": 4.921052631578947,
+ "grad_norm": 0.5350770354270935,
+ "learning_rate": 5.275835582231833e-07,
+ "loss": 0.0992,
+ "step": 561
+ },
+ {
+ "epoch": 4.9298245614035086,
+ "grad_norm": 0.7599822878837585,
+ "learning_rate": 5.193488325204551e-07,
+ "loss": 0.0983,
+ "step": 562
+ },
+ {
+ "epoch": 4.93859649122807,
+ "grad_norm": 0.47537004947662354,
+ "learning_rate": 5.111714234939868e-07,
+ "loss": 0.1004,
+ "step": 563
+ },
+ {
+ "epoch": 4.947368421052632,
+ "grad_norm": 0.597273588180542,
+ "learning_rate": 5.030515677840883e-07,
+ "loss": 0.1015,
+ "step": 564
+ },
+ {
+ "epoch": 4.956140350877193,
+ "grad_norm": 0.7155528664588928,
+ "learning_rate": 4.949895003655728e-07,
+ "loss": 0.1017,
+ "step": 565
+ },
+ {
+ "epoch": 4.964912280701754,
+ "grad_norm": 0.530358612537384,
+ "learning_rate": 4.869854545409627e-07,
+ "loss": 0.0998,
+ "step": 566
+ },
+ {
+ "epoch": 4.973684210526316,
+ "grad_norm": 0.6721721291542053,
+ "learning_rate": 4.790396619337286e-07,
+ "loss": 0.1003,
+ "step": 567
+ },
+ {
+ "epoch": 4.982456140350877,
+ "grad_norm": 0.8486731648445129,
+ "learning_rate": 4.711523524815978e-07,
+ "loss": 0.0996,
+ "step": 568
+ },
+ {
+ "epoch": 4.991228070175438,
+ "grad_norm": 0.7072808742523193,
+ "learning_rate": 4.633237544298891e-07,
+ "loss": 0.1004,
+ "step": 569
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.41283953189849854,
+ "learning_rate": 4.555540943249187e-07,
+ "loss": 0.1026,
+ "step": 570
+ },
+ {
+ "epoch": 5.008771929824562,
+ "grad_norm": 0.7376545667648315,
+ "learning_rate": 4.478435970074341e-07,
+ "loss": 0.1001,
+ "step": 571
+ },
+ {
+ "epoch": 5.017543859649122,
+ "grad_norm": 0.42418381571769714,
+ "learning_rate": 4.401924856061146e-07,
+ "loss": 0.0998,
+ "step": 572
+ },
+ {
+ "epoch": 5.026315789473684,
+ "grad_norm": 0.5682939291000366,
+ "learning_rate": 4.326009815311125e-07,
+ "loss": 0.1015,
+ "step": 573
+ },
+ {
+ "epoch": 5.035087719298246,
+ "grad_norm": 0.6277433633804321,
+ "learning_rate": 4.250693044676429e-07,
+ "loss": 0.1067,
+ "step": 574
+ },
+ {
+ "epoch": 5.043859649122807,
+ "grad_norm": 0.8414298892021179,
+ "learning_rate": 4.175976723696337e-07,
+ "loss": 0.1007,
+ "step": 575
+ },
+ {
+ "epoch": 5.052631578947368,
+ "grad_norm": 0.48310723900794983,
+ "learning_rate": 4.1018630145340744e-07,
+ "loss": 0.0966,
+ "step": 576
+ },
+ {
+ "epoch": 5.06140350877193,
+ "grad_norm": 0.7204103469848633,
+ "learning_rate": 4.028354061914369e-07,
+ "loss": 0.1001,
+ "step": 577
+ },
+ {
+ "epoch": 5.0701754385964914,
+ "grad_norm": 0.4454537630081177,
+ "learning_rate": 3.9554519930612683e-07,
+ "loss": 0.0975,
+ "step": 578
+ },
+ {
+ "epoch": 5.078947368421052,
+ "grad_norm": 0.71866774559021,
+ "learning_rate": 3.88315891763667e-07,
+ "loss": 0.0995,
+ "step": 579
+ },
+ {
+ "epoch": 5.087719298245614,
+ "grad_norm": 0.5037544369697571,
+ "learning_rate": 3.811476927679228e-07,
+ "loss": 0.1003,
+ "step": 580
+ },
+ {
+ "epoch": 5.0964912280701755,
+ "grad_norm": 0.4898604154586792,
+ "learning_rate": 3.7404080975438073e-07,
+ "loss": 0.1006,
+ "step": 581
+ },
+ {
+ "epoch": 5.105263157894737,
+ "grad_norm": 0.5109504461288452,
+ "learning_rate": 3.6699544838415035e-07,
+ "loss": 0.0975,
+ "step": 582
+ },
+ {
+ "epoch": 5.114035087719298,
+ "grad_norm": 0.5904539227485657,
+ "learning_rate": 3.600118125380056e-07,
+ "loss": 0.1027,
+ "step": 583
+ },
+ {
+ "epoch": 5.12280701754386,
+ "grad_norm": 0.7211642265319824,
+ "learning_rate": 3.5309010431049284e-07,
+ "loss": 0.1025,
+ "step": 584
+ },
+ {
+ "epoch": 5.131578947368421,
+ "grad_norm": 0.6350153088569641,
+ "learning_rate": 3.462305240040739e-07,
+ "loss": 0.1003,
+ "step": 585
+ },
+ {
+ "epoch": 5.140350877192983,
+ "grad_norm": 0.4940623641014099,
+ "learning_rate": 3.394332701233391e-07,
+ "loss": 0.1009,
+ "step": 586
+ },
+ {
+ "epoch": 5.149122807017544,
+ "grad_norm": 0.6850067973136902,
+ "learning_rate": 3.326985393692539e-07,
+ "loss": 0.0976,
+ "step": 587
+ },
+ {
+ "epoch": 5.157894736842105,
+ "grad_norm": 0.5988023281097412,
+ "learning_rate": 3.260265266334725e-07,
+ "loss": 0.1002,
+ "step": 588
+ },
+ {
+ "epoch": 5.166666666666667,
+ "grad_norm": 0.4078713059425354,
+ "learning_rate": 3.1941742499269764e-07,
+ "loss": 0.1002,
+ "step": 589
+ },
+ {
+ "epoch": 5.175438596491228,
+ "grad_norm": 0.9976629614830017,
+ "learning_rate": 3.128714257030882e-07,
+ "loss": 0.0982,
+ "step": 590
+ },
+ {
+ "epoch": 5.184210526315789,
+ "grad_norm": 0.7467443346977234,
+ "learning_rate": 3.063887181947334e-07,
+ "loss": 0.0978,
+ "step": 591
+ },
+ {
+ "epoch": 5.192982456140351,
+ "grad_norm": 0.46989375352859497,
+ "learning_rate": 2.9996949006616096e-07,
+ "loss": 0.099,
+ "step": 592
+ },
+ {
+ "epoch": 5.201754385964913,
+ "grad_norm": 0.6407843828201294,
+ "learning_rate": 2.9361392707891763e-07,
+ "loss": 0.1009,
+ "step": 593
+ },
+ {
+ "epoch": 5.2105263157894735,
+ "grad_norm": 0.5148762464523315,
+ "learning_rate": 2.8732221315218576e-07,
+ "loss": 0.1048,
+ "step": 594
+ },
+ {
+ "epoch": 5.219298245614035,
+ "grad_norm": 1.0204253196716309,
+ "learning_rate": 2.810945303574664e-07,
+ "loss": 0.1032,
+ "step": 595
+ },
+ {
+ "epoch": 5.228070175438597,
+ "grad_norm": 0.5452238321304321,
+ "learning_rate": 2.7493105891330837e-07,
+ "loss": 0.0987,
+ "step": 596
+ },
+ {
+ "epoch": 5.2368421052631575,
+ "grad_norm": 0.561916708946228,
+ "learning_rate": 2.688319771800929e-07,
+ "loss": 0.0972,
+ "step": 597
+ },
+ {
+ "epoch": 5.245614035087719,
+ "grad_norm": 0.4652751684188843,
+ "learning_rate": 2.6279746165487256e-07,
+ "loss": 0.0991,
+ "step": 598
+ },
+ {
+ "epoch": 5.254385964912281,
+ "grad_norm": 0.8166212439537048,
+ "learning_rate": 2.568276869662628e-07,
+ "loss": 0.0998,
+ "step": 599
+ },
+ {
+ "epoch": 5.2631578947368425,
+ "grad_norm": 0.5090087056159973,
+ "learning_rate": 2.5092282586939187e-07,
+ "loss": 0.1011,
+ "step": 600
+ },
+ {
+ "epoch": 5.271929824561403,
+ "grad_norm": 0.8435099720954895,
+ "learning_rate": 2.450830492408954e-07,
+ "loss": 0.1016,
+ "step": 601
+ },
+ {
+ "epoch": 5.280701754385965,
+ "grad_norm": 0.8541790843009949,
+ "learning_rate": 2.393085260739794e-07,
+ "loss": 0.1034,
+ "step": 602
+ },
+ {
+ "epoch": 5.2894736842105265,
+ "grad_norm": 0.7966872453689575,
+ "learning_rate": 2.3359942347352172e-07,
+ "loss": 0.0996,
+ "step": 603
+ },
+ {
+ "epoch": 5.298245614035087,
+ "grad_norm": 0.6361204981803894,
+ "learning_rate": 2.2795590665124267e-07,
+ "loss": 0.1007,
+ "step": 604
+ },
+ {
+ "epoch": 5.307017543859649,
+ "grad_norm": 0.4418005049228668,
+ "learning_rate": 2.2237813892092175e-07,
+ "loss": 0.0983,
+ "step": 605
+ },
+ {
+ "epoch": 5.315789473684211,
+ "grad_norm": 0.3627215623855591,
+ "learning_rate": 2.1686628169366923e-07,
+ "loss": 0.1016,
+ "step": 606
+ },
+ {
+ "epoch": 5.324561403508772,
+ "grad_norm": 0.6289935111999512,
+ "learning_rate": 2.114204944732609e-07,
+ "loss": 0.1024,
+ "step": 607
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 0.5116890668869019,
+ "learning_rate": 2.0604093485151548e-07,
+ "loss": 0.1005,
+ "step": 608
+ },
+ {
+ "epoch": 5.342105263157895,
+ "grad_norm": 0.344194620847702,
+ "learning_rate": 2.007277585037412e-07,
+ "loss": 0.1007,
+ "step": 609
+ },
+ {
+ "epoch": 5.350877192982456,
+ "grad_norm": 1.0403063297271729,
+ "learning_rate": 1.95481119184224e-07,
+ "loss": 0.1006,
+ "step": 610
+ },
+ {
+ "epoch": 5.359649122807017,
+ "grad_norm": 0.32791537046432495,
+ "learning_rate": 1.9030116872178317e-07,
+ "loss": 0.1007,
+ "step": 611
+ },
+ {
+ "epoch": 5.368421052631579,
+ "grad_norm": 0.6505579948425293,
+ "learning_rate": 1.851880570153755e-07,
+ "loss": 0.1049,
+ "step": 612
+ },
+ {
+ "epoch": 5.37719298245614,
+ "grad_norm": 0.726384162902832,
+ "learning_rate": 1.801419320297576e-07,
+ "loss": 0.1023,
+ "step": 613
+ },
+ {
+ "epoch": 5.385964912280702,
+ "grad_norm": 1.0476131439208984,
+ "learning_rate": 1.7516293979120525e-07,
+ "loss": 0.0984,
+ "step": 614
+ },
+ {
+ "epoch": 5.394736842105263,
+ "grad_norm": 0.8576235771179199,
+ "learning_rate": 1.7025122438328434e-07,
+ "loss": 0.1006,
+ "step": 615
+ },
+ {
+ "epoch": 5.4035087719298245,
+ "grad_norm": 0.2774132788181305,
+ "learning_rate": 1.654069279426873e-07,
+ "loss": 0.0986,
+ "step": 616
+ },
+ {
+ "epoch": 5.412280701754386,
+ "grad_norm": 0.43602442741394043,
+ "learning_rate": 1.6063019065511276e-07,
+ "loss": 0.0992,
+ "step": 617
+ },
+ {
+ "epoch": 5.421052631578947,
+ "grad_norm": 0.6421550512313843,
+ "learning_rate": 1.5592115075121512e-07,
+ "loss": 0.1017,
+ "step": 618
+ },
+ {
+ "epoch": 5.4298245614035086,
+ "grad_norm": 0.7895707488059998,
+ "learning_rate": 1.5127994450259976e-07,
+ "loss": 0.097,
+ "step": 619
+ },
+ {
+ "epoch": 5.43859649122807,
+ "grad_norm": 0.5679956674575806,
+ "learning_rate": 1.467067062178823e-07,
+ "loss": 0.0996,
+ "step": 620
+ },
+ {
+ "epoch": 5.447368421052632,
+ "grad_norm": 0.48801174759864807,
+ "learning_rate": 1.4220156823880144e-07,
+ "loss": 0.1034,
+ "step": 621
+ },
+ {
+ "epoch": 5.456140350877193,
+ "grad_norm": 0.4325696527957916,
+ "learning_rate": 1.3776466093638696e-07,
+ "loss": 0.0979,
+ "step": 622
+ },
+ {
+ "epoch": 5.464912280701754,
+ "grad_norm": 0.38854703307151794,
+ "learning_rate": 1.3339611270719198e-07,
+ "loss": 0.0998,
+ "step": 623
+ },
+ {
+ "epoch": 5.473684210526316,
+ "grad_norm": 0.698753833770752,
+ "learning_rate": 1.2909604996957093e-07,
+ "loss": 0.1008,
+ "step": 624
+ },
+ {
+ "epoch": 5.482456140350878,
+ "grad_norm": 0.6861230134963989,
+ "learning_rate": 1.2486459716002792e-07,
+ "loss": 0.1029,
+ "step": 625
+ },
+ {
+ "epoch": 5.491228070175438,
+ "grad_norm": 0.564124345779419,
+ "learning_rate": 1.2070187672960948e-07,
+ "loss": 0.1036,
+ "step": 626
+ },
+ {
+ "epoch": 5.5,
+ "grad_norm": 0.47016748785972595,
+ "learning_rate": 1.1660800914036568e-07,
+ "loss": 0.0999,
+ "step": 627
+ },
+ {
+ "epoch": 5.508771929824562,
+ "grad_norm": 0.6495513319969177,
+ "learning_rate": 1.1258311286186208e-07,
+ "loss": 0.0995,
+ "step": 628
+ },
+ {
+ "epoch": 5.517543859649123,
+ "grad_norm": 0.312717080116272,
+ "learning_rate": 1.086273043677516e-07,
+ "loss": 0.098,
+ "step": 629
+ },
+ {
+ "epoch": 5.526315789473684,
+ "grad_norm": 0.6478825211524963,
+ "learning_rate": 1.0474069813240505e-07,
+ "loss": 0.098,
+ "step": 630
+ },
+ {
+ "epoch": 5.535087719298246,
+ "grad_norm": 0.7767362594604492,
+ "learning_rate": 1.0092340662759548e-07,
+ "loss": 0.1022,
+ "step": 631
+ },
+ {
+ "epoch": 5.543859649122807,
+ "grad_norm": 0.5980598330497742,
+ "learning_rate": 9.717554031924842e-08,
+ "loss": 0.0977,
+ "step": 632
+ },
+ {
+ "epoch": 5.552631578947368,
+ "grad_norm": 0.7471850514411926,
+ "learning_rate": 9.349720766423931e-08,
+ "loss": 0.0991,
+ "step": 633
+ },
+ {
+ "epoch": 5.56140350877193,
+ "grad_norm": 0.48221901059150696,
+ "learning_rate": 8.988851510726093e-08,
+ "loss": 0.0985,
+ "step": 634
+ },
+ {
+ "epoch": 5.5701754385964914,
+ "grad_norm": 0.8782841563224792,
+ "learning_rate": 8.634956707773729e-08,
+ "loss": 0.1025,
+ "step": 635
+ },
+ {
+ "epoch": 5.578947368421053,
+ "grad_norm": 0.35953524708747864,
+ "learning_rate": 8.288046598680627e-08,
+ "loss": 0.1016,
+ "step": 636
+ },
+ {
+ "epoch": 5.587719298245614,
+ "grad_norm": 0.3914284408092499,
+ "learning_rate": 7.948131222435346e-08,
+ "loss": 0.096,
+ "step": 637
+ },
+ {
+ "epoch": 5.5964912280701755,
+ "grad_norm": 0.5373840928077698,
+ "learning_rate": 7.61522041561069e-08,
+ "loss": 0.1005,
+ "step": 638
+ },
+ {
+ "epoch": 5.605263157894737,
+ "grad_norm": 0.6877533197402954,
+ "learning_rate": 7.289323812079363e-08,
+ "loss": 0.0974,
+ "step": 639
+ },
+ {
+ "epoch": 5.614035087719298,
+ "grad_norm": 0.6217812299728394,
+ "learning_rate": 6.97045084273465e-08,
+ "loss": 0.0988,
+ "step": 640
+ },
+ {
+ "epoch": 5.62280701754386,
+ "grad_norm": 0.5998544692993164,
+ "learning_rate": 6.658610735218147e-08,
+ "loss": 0.101,
+ "step": 641
+ },
+ {
+ "epoch": 5.631578947368421,
+ "grad_norm": 0.44546636939048767,
+ "learning_rate": 6.353812513652052e-08,
+ "loss": 0.0993,
+ "step": 642
+ },
+ {
+ "epoch": 5.640350877192983,
+ "grad_norm": 0.5359933972358704,
+ "learning_rate": 6.056064998378658e-08,
+ "loss": 0.1039,
+ "step": 643
+ },
+ {
+ "epoch": 5.649122807017544,
+ "grad_norm": 0.45402801036834717,
+ "learning_rate": 5.7653768057045757e-08,
+ "loss": 0.1008,
+ "step": 644
+ },
+ {
+ "epoch": 5.657894736842105,
+ "grad_norm": 0.6362654566764832,
+ "learning_rate": 5.481756347651773e-08,
+ "loss": 0.0968,
+ "step": 645
+ },
+ {
+ "epoch": 5.666666666666667,
+ "grad_norm": 0.3837541937828064,
+ "learning_rate": 5.205211831713935e-08,
+ "loss": 0.1001,
+ "step": 646
+ },
+ {
+ "epoch": 5.675438596491228,
+ "grad_norm": 0.4877745807170868,
+ "learning_rate": 4.935751260618987e-08,
+ "loss": 0.1021,
+ "step": 647
+ },
+ {
+ "epoch": 5.684210526315789,
+ "grad_norm": 0.5268471837043762,
+ "learning_rate": 4.6733824320976674e-08,
+ "loss": 0.1016,
+ "step": 648
+ },
+ {
+ "epoch": 5.692982456140351,
+ "grad_norm": 0.5390419363975525,
+ "learning_rate": 4.418112938657571e-08,
+ "loss": 0.1016,
+ "step": 649
+ },
+ {
+ "epoch": 5.701754385964913,
+ "grad_norm": 0.6146634221076965,
+ "learning_rate": 4.169950167363768e-08,
+ "loss": 0.0948,
+ "step": 650
+ },
+ {
+ "epoch": 5.7105263157894735,
+ "grad_norm": 0.5784945487976074,
+ "learning_rate": 3.928901299624782e-08,
+ "loss": 0.1007,
+ "step": 651
+ },
+ {
+ "epoch": 5.719298245614035,
+ "grad_norm": 0.8223549723625183,
+ "learning_rate": 3.6949733109848395e-08,
+ "loss": 0.1011,
+ "step": 652
+ },
+ {
+ "epoch": 5.728070175438597,
+ "grad_norm": 0.9502666592597961,
+ "learning_rate": 3.468172970922168e-08,
+ "loss": 0.102,
+ "step": 653
+ },
+ {
+ "epoch": 5.7368421052631575,
+ "grad_norm": 0.5113492608070374,
+ "learning_rate": 3.248506842652793e-08,
+ "loss": 0.101,
+ "step": 654
+ },
+ {
+ "epoch": 5.745614035087719,
+ "grad_norm": 1.0006201267242432,
+ "learning_rate": 3.0359812829409694e-08,
+ "loss": 0.0987,
+ "step": 655
+ },
+ {
+ "epoch": 5.754385964912281,
+ "grad_norm": 0.6877694129943848,
+ "learning_rate": 2.8306024419148814e-08,
+ "loss": 0.1003,
+ "step": 656
+ },
+ {
+ "epoch": 5.7631578947368425,
+ "grad_norm": 0.4734198749065399,
+ "learning_rate": 2.6323762628889804e-08,
+ "loss": 0.0975,
+ "step": 657
+ },
+ {
+ "epoch": 5.771929824561403,
+ "grad_norm": 0.8467719554901123,
+ "learning_rate": 2.4413084821916232e-08,
+ "loss": 0.0978,
+ "step": 658
+ },
+ {
+ "epoch": 5.780701754385965,
+ "grad_norm": 0.47460225224494934,
+ "learning_rate": 2.2574046289995933e-08,
+ "loss": 0.1001,
+ "step": 659
+ },
+ {
+ "epoch": 5.7894736842105265,
+ "grad_norm": 0.37792477011680603,
+ "learning_rate": 2.0806700251775057e-08,
+ "loss": 0.1002,
+ "step": 660
+ },
+ {
+ "epoch": 5.798245614035087,
+ "grad_norm": 0.7944504618644714,
+ "learning_rate": 1.9111097851242654e-08,
+ "loss": 0.0997,
+ "step": 661
+ },
+ {
+ "epoch": 5.807017543859649,
+ "grad_norm": 0.3530051112174988,
+ "learning_rate": 1.7487288156248782e-08,
+ "loss": 0.1021,
+ "step": 662
+ },
+ {
+ "epoch": 5.815789473684211,
+ "grad_norm": 0.6301564574241638,
+ "learning_rate": 1.593531815708371e-08,
+ "loss": 0.1023,
+ "step": 663
+ },
+ {
+ "epoch": 5.824561403508772,
+ "grad_norm": 0.5501565337181091,
+ "learning_rate": 1.4455232765120397e-08,
+ "loss": 0.1014,
+ "step": 664
+ },
+ {
+ "epoch": 5.833333333333333,
+ "grad_norm": 0.5270814299583435,
+ "learning_rate": 1.3047074811512184e-08,
+ "loss": 0.099,
+ "step": 665
+ },
+ {
+ "epoch": 5.842105263157895,
+ "grad_norm": 0.6463411450386047,
+ "learning_rate": 1.1710885045956022e-08,
+ "loss": 0.0987,
+ "step": 666
+ },
+ {
+ "epoch": 5.850877192982456,
+ "grad_norm": 1.0232126712799072,
+ "learning_rate": 1.0446702135511188e-08,
+ "loss": 0.1017,
+ "step": 667
+ },
+ {
+ "epoch": 5.859649122807017,
+ "grad_norm": 0.3154284954071045,
+ "learning_rate": 9.25456266348046e-09,
+ "loss": 0.0921,
+ "step": 668
+ },
+ {
+ "epoch": 5.868421052631579,
+ "grad_norm": 0.7173347473144531,
+ "learning_rate": 8.134501128353456e-09,
+ "loss": 0.1007,
+ "step": 669
+ },
+ {
+ "epoch": 5.87719298245614,
+ "grad_norm": 0.6975192427635193,
+ "learning_rate": 7.086549942805499e-09,
+ "loss": 0.1031,
+ "step": 670
+ },
+ {
+ "epoch": 5.885964912280702,
+ "grad_norm": 0.7983221411705017,
+ "learning_rate": 6.110739432762247e-09,
+ "loss": 0.0991,
+ "step": 671
+ },
+ {
+ "epoch": 5.894736842105263,
+ "grad_norm": 0.8028814196586609,
+ "learning_rate": 5.20709783651957e-09,
+ "loss": 0.0942,
+ "step": 672
+ },
+ {
+ "epoch": 5.9035087719298245,
+ "grad_norm": 0.7531240582466125,
+ "learning_rate": 4.375651303928918e-09,
+ "loss": 0.1025,
+ "step": 673
+ },
+ {
+ "epoch": 5.912280701754386,
+ "grad_norm": 0.5777604579925537,
+ "learning_rate": 3.6164238956384878e-09,
+ "loss": 0.1,
+ "step": 674
+ },
+ {
+ "epoch": 5.921052631578947,
+ "grad_norm": 0.7759271264076233,
+ "learning_rate": 2.929437582398775e-09,
+ "loss": 0.0992,
+ "step": 675
+ },
+ {
+ "epoch": 5.9298245614035086,
+ "grad_norm": 0.38214001059532166,
+ "learning_rate": 2.3147122444250327e-09,
+ "loss": 0.1021,
+ "step": 676
+ },
+ {
+ "epoch": 5.93859649122807,
+ "grad_norm": 0.7428710460662842,
+ "learning_rate": 1.7722656708230034e-09,
+ "loss": 0.0989,
+ "step": 677
+ },
+ {
+ "epoch": 5.947368421052632,
+ "grad_norm": 0.6303841471672058,
+ "learning_rate": 1.3021135590740585e-09,
+ "loss": 0.0993,
+ "step": 678
+ },
+ {
+ "epoch": 5.956140350877193,
+ "grad_norm": 0.8671356439590454,
+ "learning_rate": 9.04269514580558e-10,
+ "loss": 0.1026,
+ "step": 679
+ },
+ {
+ "epoch": 5.964912280701754,
+ "grad_norm": 0.4996141195297241,
+ "learning_rate": 5.787450502728331e-10,
+ "loss": 0.1033,
+ "step": 680
+ },
+ {
+ "epoch": 5.973684210526316,
+ "grad_norm": 0.625603199005127,
+ "learning_rate": 3.255495862750091e-10,
+ "loss": 0.1038,
+ "step": 681
+ },
+ {
+ "epoch": 5.982456140350877,
+ "grad_norm": 0.674436628818512,
+ "learning_rate": 1.446904496335555e-10,
+ "loss": 0.0969,
+ "step": 682
+ },
+ {
+ "epoch": 5.991228070175438,
+ "grad_norm": 0.778946578502655,
+ "learning_rate": 3.6172874103845845e-11,
+ "loss": 0.1004,
+ "step": 683
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.5384402275085449,
+ "learning_rate": 0.0,
+ "loss": 0.0971,
+ "step": 684
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 684,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 114,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.075994044489112e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-684/training_args.bin b/checkpoint-684/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf
--- /dev/null
+++ b/checkpoint-684/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae
+size 7992
diff --git a/checkpoint-684/zero_to_fp32.py b/checkpoint-684/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-684/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7324bbcd3035a34c2ac96f0e2a46dd94a5db25c
--- /dev/null
+++ b/config.json
@@ -0,0 +1,1497 @@
+{
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+ "architectures": [
+ "DeciLMForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "nvidia/Llama-3_3-Nemotron-Super-49B-v1--configuration_decilm.DeciLMConfig",
+ "AutoModelForCausalLM": "nvidia/Llama-3_3-Nemotron-Super-49B-v1--modeling_decilm.DeciLMForCausalLM"
+ },
+ "block_configs": [
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 2.625,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 2.625,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 2.625,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 3.28125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 2.625,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 2.625,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 2.625,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.3125,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 0.5,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 0.5,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 0.5,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 0.5,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 1.0,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 0.5,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": null,
+ "no_op": true,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 0.5,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ },
+ {
+ "attention": {
+ "n_heads_in_group": 8,
+ "no_op": false,
+ "num_sink_tokens": null,
+ "replace_with_linear": false,
+ "sparsify": null,
+ "unshifted_sink": false,
+ "use_prefill_window_in_sink_attention": false,
+ "window_length": null
+ },
+ "ffn": {
+ "ffn_mult": 5.25,
+ "no_op": false,
+ "replace_with_linear": false,
+ "sparsify": null
+ }
+ }
+ ],
+ "bos_token_id": 128000,
+ "eos_token_id": 128009,
+ "hidden_act": "silu",
+ "hidden_size": 8192,
+ "initializer_range": 0.02,
+ "intermediate_size": null,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "nemotron-nas",
+ "num_attention_heads": 64,
+ "num_hidden_layers": 80,
+ "num_key_value_heads": null,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_storage": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 8.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.49.0",
+ "use_cache": false,
+ "vocab_size": 128256
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}{%- set content = message['content'].split('</think>')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}