enoch10jason committed on Oct 4, 2025

Commit

e2fc576

verified ·

1 Parent(s): af5ca10

Upload folder using huggingface_hub

Browse files

Files changed (35) hide show

README.md +202 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
added_tokens.json +13 -0
checkpoint-4600/README.md +202 -0
checkpoint-4600/adapter_config.json +31 -0
checkpoint-4600/adapter_model.safetensors +3 -0
checkpoint-4600/added_tokens.json +13 -0
checkpoint-4600/optimizer.pt +3 -0
checkpoint-4600/rng_state.pth +3 -0
checkpoint-4600/scheduler.pt +3 -0
checkpoint-4600/special_tokens_map.json +30 -0
checkpoint-4600/tokenizer.json +0 -0
checkpoint-4600/tokenizer.model +3 -0
checkpoint-4600/tokenizer_config.json +130 -0
checkpoint-4600/trainer_state.json +1643 -0
checkpoint-4600/training_args.bin +3 -0
checkpoint-4800/README.md +202 -0
checkpoint-4800/adapter_config.json +31 -0
checkpoint-4800/adapter_model.safetensors +3 -0
checkpoint-4800/added_tokens.json +13 -0
checkpoint-4800/optimizer.pt +3 -0
checkpoint-4800/rng_state.pth +3 -0
checkpoint-4800/scheduler.pt +3 -0
checkpoint-4800/special_tokens_map.json +30 -0
checkpoint-4800/tokenizer.json +0 -0
checkpoint-4800/tokenizer.model +3 -0
checkpoint-4800/tokenizer_config.json +130 -0
checkpoint-4800/trainer_state.json +1713 -0
checkpoint-4800/training_args.bin +3 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +130 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: microsoft/phi-3-mini-4k-instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_up_proj",
+    "qkv_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1d76eadb0ee98f0acc29bf9df6e692ca0dcb6a6863e6ca6332fb1622acc2661
+size 443717160

added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

checkpoint-4600/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: microsoft/phi-3-mini-4k-instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-4600/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_up_proj",
+    "qkv_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-4600/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f31c6bf479cced792e9c9c18094869bc0995dc5f5b08fc1af44428c0472c902e
+size 443717160

checkpoint-4600/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

checkpoint-4600/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f4c22f1e20185898445f4c766ec5990b0108cd7a51283ff5e0fc612177421aa
+size 100878458

checkpoint-4600/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab3ad5cc3207de72890db89e3fa5f408c872c6188b088e48a8c966cecad9b91b
+size 14244

checkpoint-4600/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f2e48902735ae070f9ec317233927b6040a5820a89e1922f9e18f6291ae5d7f1
+size 1064

checkpoint-4600/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-4600/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-4600/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-4600/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,130 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-4600/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1643 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.920046002300115,
+  "eval_steps": 500,
+  "global_step": 4600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0040002000100005,
+      "grad_norm": 0.8278714418411255,
+      "learning_rate": 4.980996199239848e-05,
+      "loss": 2.8118,
+      "step": 20
+    },
+    {
+      "epoch": 0.008000400020001,
+      "grad_norm": 1.106671690940857,
+      "learning_rate": 4.960992198439688e-05,
+      "loss": 2.3191,
+      "step": 40
+    },
+    {
+      "epoch": 0.0120006000300015,
+      "grad_norm": 0.8004185557365417,
+      "learning_rate": 4.9409881976395286e-05,
+      "loss": 2.0999,
+      "step": 60
+    },
+    {
+      "epoch": 0.016000800040002,
+      "grad_norm": 0.8842968344688416,
+      "learning_rate": 4.920984196839368e-05,
+      "loss": 2.0641,
+      "step": 80
+    },
+    {
+      "epoch": 0.0200010000500025,
+      "grad_norm": 0.7633001208305359,
+      "learning_rate": 4.900980196039208e-05,
+      "loss": 2.122,
+      "step": 100
+    },
+    {
+      "epoch": 0.024001200060003,
+      "grad_norm": 1.039692759513855,
+      "learning_rate": 4.8809761952390485e-05,
+      "loss": 1.9996,
+      "step": 120
+    },
+    {
+      "epoch": 0.0280014000700035,
+      "grad_norm": 0.6684374809265137,
+      "learning_rate": 4.860972194438888e-05,
+      "loss": 2.1058,
+      "step": 140
+    },
+    {
+      "epoch": 0.032001600080004,
+      "grad_norm": 0.6902151107788086,
+      "learning_rate": 4.840968193638728e-05,
+      "loss": 2.1403,
+      "step": 160
+    },
+    {
+      "epoch": 0.0360018000900045,
+      "grad_norm": 0.6190841794013977,
+      "learning_rate": 4.820964192838568e-05,
+      "loss": 2.0701,
+      "step": 180
+    },
+    {
+      "epoch": 0.040002000100005,
+      "grad_norm": 0.6660407781600952,
+      "learning_rate": 4.800960192038408e-05,
+      "loss": 2.1249,
+      "step": 200
+    },
+    {
+      "epoch": 0.0440022001100055,
+      "grad_norm": 0.8184217214584351,
+      "learning_rate": 4.780956191238248e-05,
+      "loss": 1.9919,
+      "step": 220
+    },
+    {
+      "epoch": 0.048002400120006,
+      "grad_norm": 0.8200207352638245,
+      "learning_rate": 4.7609521904380875e-05,
+      "loss": 2.0187,
+      "step": 240
+    },
+    {
+      "epoch": 0.0520026001300065,
+      "grad_norm": 0.731918215751648,
+      "learning_rate": 4.740948189637928e-05,
+      "loss": 1.9556,
+      "step": 260
+    },
+    {
+      "epoch": 0.056002800140007,
+      "grad_norm": 0.839863121509552,
+      "learning_rate": 4.720944188837768e-05,
+      "loss": 2.0764,
+      "step": 280
+    },
+    {
+      "epoch": 0.0600030001500075,
+      "grad_norm": 0.6660728454589844,
+      "learning_rate": 4.7009401880376074e-05,
+      "loss": 1.9794,
+      "step": 300
+    },
+    {
+      "epoch": 0.064003200160008,
+      "grad_norm": 0.880646824836731,
+      "learning_rate": 4.680936187237448e-05,
+      "loss": 2.0677,
+      "step": 320
+    },
+    {
+      "epoch": 0.0680034001700085,
+      "grad_norm": 0.6575286984443665,
+      "learning_rate": 4.6609321864372876e-05,
+      "loss": 2.2138,
+      "step": 340
+    },
+    {
+      "epoch": 0.072003600180009,
+      "grad_norm": 0.7487124800682068,
+      "learning_rate": 4.640928185637128e-05,
+      "loss": 2.1787,
+      "step": 360
+    },
+    {
+      "epoch": 0.0760038001900095,
+      "grad_norm": 0.7617707252502441,
+      "learning_rate": 4.620924184836968e-05,
+      "loss": 2.0341,
+      "step": 380
+    },
+    {
+      "epoch": 0.08000400020001,
+      "grad_norm": 0.7258076667785645,
+      "learning_rate": 4.6009201840368075e-05,
+      "loss": 2.2021,
+      "step": 400
+    },
+    {
+      "epoch": 0.0840042002100105,
+      "grad_norm": 1.1093262434005737,
+      "learning_rate": 4.580916183236648e-05,
+      "loss": 2.1792,
+      "step": 420
+    },
+    {
+      "epoch": 0.088004400220011,
+      "grad_norm": 0.6948565244674683,
+      "learning_rate": 4.5609121824364876e-05,
+      "loss": 1.9883,
+      "step": 440
+    },
+    {
+      "epoch": 0.0920046002300115,
+      "grad_norm": 0.8213192820549011,
+      "learning_rate": 4.540908181636327e-05,
+      "loss": 2.08,
+      "step": 460
+    },
+    {
+      "epoch": 0.096004800240012,
+      "grad_norm": 0.6234931349754333,
+      "learning_rate": 4.520904180836167e-05,
+      "loss": 2.0775,
+      "step": 480
+    },
+    {
+      "epoch": 0.10000500025001251,
+      "grad_norm": 0.7921673059463501,
+      "learning_rate": 4.5009001800360075e-05,
+      "loss": 1.9584,
+      "step": 500
+    },
+    {
+      "epoch": 0.104005200260013,
+      "grad_norm": 0.5997710227966309,
+      "learning_rate": 4.480896179235847e-05,
+      "loss": 2.0239,
+      "step": 520
+    },
+    {
+      "epoch": 0.1080054002700135,
+      "grad_norm": 0.6408416032791138,
+      "learning_rate": 4.460892178435687e-05,
+      "loss": 2.0004,
+      "step": 540
+    },
+    {
+      "epoch": 0.112005600280014,
+      "grad_norm": 0.712670087814331,
+      "learning_rate": 4.4408881776355274e-05,
+      "loss": 2.1011,
+      "step": 560
+    },
+    {
+      "epoch": 0.1160058002900145,
+      "grad_norm": 0.8731632828712463,
+      "learning_rate": 4.420884176835367e-05,
+      "loss": 1.9921,
+      "step": 580
+    },
+    {
+      "epoch": 0.120006000300015,
+      "grad_norm": 0.7469409108161926,
+      "learning_rate": 4.4008801760352075e-05,
+      "loss": 1.9548,
+      "step": 600
+    },
+    {
+      "epoch": 0.1240062003100155,
+      "grad_norm": 0.9184134602546692,
+      "learning_rate": 4.380876175235047e-05,
+      "loss": 1.9798,
+      "step": 620
+    },
+    {
+      "epoch": 0.128006400320016,
+      "grad_norm": 0.6513165831565857,
+      "learning_rate": 4.360872174434888e-05,
+      "loss": 1.9918,
+      "step": 640
+    },
+    {
+      "epoch": 0.1320066003300165,
+      "grad_norm": 0.8042064905166626,
+      "learning_rate": 4.3408681736347274e-05,
+      "loss": 2.0268,
+      "step": 660
+    },
+    {
+      "epoch": 0.136006800340017,
+      "grad_norm": 0.6575964093208313,
+      "learning_rate": 4.320864172834567e-05,
+      "loss": 1.9958,
+      "step": 680
+    },
+    {
+      "epoch": 0.1400070003500175,
+      "grad_norm": 0.6663448214530945,
+      "learning_rate": 4.300860172034407e-05,
+      "loss": 2.0274,
+      "step": 700
+    },
+    {
+      "epoch": 0.144007200360018,
+      "grad_norm": 0.6880171298980713,
+      "learning_rate": 4.280856171234247e-05,
+      "loss": 1.9933,
+      "step": 720
+    },
+    {
+      "epoch": 0.1480074003700185,
+      "grad_norm": 1.1512036323547363,
+      "learning_rate": 4.260852170434087e-05,
+      "loss": 2.0047,
+      "step": 740
+    },
+    {
+      "epoch": 0.152007600380019,
+      "grad_norm": 0.9109539985656738,
+      "learning_rate": 4.240848169633927e-05,
+      "loss": 2.0879,
+      "step": 760
+    },
+    {
+      "epoch": 0.1560078003900195,
+      "grad_norm": 0.6704302430152893,
+      "learning_rate": 4.220844168833767e-05,
+      "loss": 2.0508,
+      "step": 780
+    },
+    {
+      "epoch": 0.16000800040002,
+      "grad_norm": 0.5899685621261597,
+      "learning_rate": 4.200840168033607e-05,
+      "loss": 2.1563,
+      "step": 800
+    },
+    {
+      "epoch": 0.1640082004100205,
+      "grad_norm": 0.8644593954086304,
+      "learning_rate": 4.1808361672334466e-05,
+      "loss": 1.9691,
+      "step": 820
+    },
+    {
+      "epoch": 0.168008400420021,
+      "grad_norm": 0.7722281813621521,
+      "learning_rate": 4.1608321664332864e-05,
+      "loss": 2.1047,
+      "step": 840
+    },
+    {
+      "epoch": 0.1720086004300215,
+      "grad_norm": 0.6952481269836426,
+      "learning_rate": 4.140828165633127e-05,
+      "loss": 2.0815,
+      "step": 860
+    },
+    {
+      "epoch": 0.176008800440022,
+      "grad_norm": 0.818111777305603,
+      "learning_rate": 4.1208241648329665e-05,
+      "loss": 2.0609,
+      "step": 880
+    },
+    {
+      "epoch": 0.1800090004500225,
+      "grad_norm": 0.7889490127563477,
+      "learning_rate": 4.100820164032807e-05,
+      "loss": 2.0989,
+      "step": 900
+    },
+    {
+      "epoch": 0.184009200460023,
+      "grad_norm": 0.9778928160667419,
+      "learning_rate": 4.0808161632326467e-05,
+      "loss": 2.0217,
+      "step": 920
+    },
+    {
+      "epoch": 0.1880094004700235,
+      "grad_norm": 0.6434693336486816,
+      "learning_rate": 4.060812162432487e-05,
+      "loss": 1.9841,
+      "step": 940
+    },
+    {
+      "epoch": 0.192009600480024,
+      "grad_norm": 0.8017128705978394,
+      "learning_rate": 4.040808161632327e-05,
+      "loss": 2.0542,
+      "step": 960
+    },
+    {
+      "epoch": 0.1960098004900245,
+      "grad_norm": 0.5969107747077942,
+      "learning_rate": 4.0208041608321665e-05,
+      "loss": 2.1491,
+      "step": 980
+    },
+    {
+      "epoch": 0.20001000050002501,
+      "grad_norm": 1.0352481603622437,
+      "learning_rate": 4.000800160032007e-05,
+      "loss": 2.0807,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2040102005100255,
+      "grad_norm": 0.6913074254989624,
+      "learning_rate": 3.980796159231847e-05,
+      "loss": 2.1653,
+      "step": 1020
+    },
+    {
+      "epoch": 0.208010400520026,
+      "grad_norm": 1.1356934309005737,
+      "learning_rate": 3.9607921584316864e-05,
+      "loss": 2.0834,
+      "step": 1040
+    },
+    {
+      "epoch": 0.2120106005300265,
+      "grad_norm": 0.7237222790718079,
+      "learning_rate": 3.940788157631526e-05,
+      "loss": 2.0639,
+      "step": 1060
+    },
+    {
+      "epoch": 0.216010800540027,
+      "grad_norm": 0.8379335403442383,
+      "learning_rate": 3.9207841568313666e-05,
+      "loss": 2.0899,
+      "step": 1080
+    },
+    {
+      "epoch": 0.2200110005500275,
+      "grad_norm": 0.8688188791275024,
+      "learning_rate": 3.900780156031206e-05,
+      "loss": 1.9249,
+      "step": 1100
+    },
+    {
+      "epoch": 0.224011200560028,
+      "grad_norm": 0.6886935234069824,
+      "learning_rate": 3.880776155231046e-05,
+      "loss": 2.0017,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2280114005700285,
+      "grad_norm": 0.6852912902832031,
+      "learning_rate": 3.8607721544308865e-05,
+      "loss": 1.9605,
+      "step": 1140
+    },
+    {
+      "epoch": 0.232011600580029,
+      "grad_norm": 1.002159833908081,
+      "learning_rate": 3.840768153630726e-05,
+      "loss": 2.0085,
+      "step": 1160
+    },
+    {
+      "epoch": 0.2360118005900295,
+      "grad_norm": 0.7874559164047241,
+      "learning_rate": 3.8207641528305666e-05,
+      "loss": 2.065,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24001200060003,
+      "grad_norm": 0.7146089673042297,
+      "learning_rate": 3.800760152030406e-05,
+      "loss": 2.0377,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2440122006100305,
+      "grad_norm": 0.7215453386306763,
+      "learning_rate": 3.780756151230246e-05,
+      "loss": 2.0733,
+      "step": 1220
+    },
+    {
+      "epoch": 0.248012400620031,
+      "grad_norm": 0.5989114046096802,
+      "learning_rate": 3.7607521504300865e-05,
+      "loss": 1.8203,
+      "step": 1240
+    },
+    {
+      "epoch": 0.2520126006300315,
+      "grad_norm": 0.854493260383606,
+      "learning_rate": 3.740748149629926e-05,
+      "loss": 2.1168,
+      "step": 1260
+    },
+    {
+      "epoch": 0.256012800640032,
+      "grad_norm": 0.724231481552124,
+      "learning_rate": 3.720744148829766e-05,
+      "loss": 2.1133,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2600130006500325,
+      "grad_norm": 0.6381165385246277,
+      "learning_rate": 3.7007401480296064e-05,
+      "loss": 2.0682,
+      "step": 1300
+    },
+    {
+      "epoch": 0.264013200660033,
+      "grad_norm": 0.6167752742767334,
+      "learning_rate": 3.680736147229446e-05,
+      "loss": 1.9757,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2680134006700335,
+      "grad_norm": 0.8661261796951294,
+      "learning_rate": 3.660732146429286e-05,
+      "loss": 2.0627,
+      "step": 1340
+    },
+    {
+      "epoch": 0.272013600680034,
+      "grad_norm": 0.7343857288360596,
+      "learning_rate": 3.6407281456291256e-05,
+      "loss": 2.0034,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2760138006900345,
+      "grad_norm": 0.622918426990509,
+      "learning_rate": 3.620724144828966e-05,
+      "loss": 2.0564,
+      "step": 1380
+    },
+    {
+      "epoch": 0.280014000700035,
+      "grad_norm": 0.8361110091209412,
+      "learning_rate": 3.600720144028806e-05,
+      "loss": 1.8507,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2840142007100355,
+      "grad_norm": 0.9155957698822021,
+      "learning_rate": 3.5807161432286455e-05,
+      "loss": 2.0867,
+      "step": 1420
+    },
+    {
+      "epoch": 0.288014400720036,
+      "grad_norm": 0.6283161640167236,
+      "learning_rate": 3.560712142428486e-05,
+      "loss": 1.9926,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2920146007300365,
+      "grad_norm": 0.7145543694496155,
+      "learning_rate": 3.540708141628326e-05,
+      "loss": 2.104,
+      "step": 1460
+    },
+    {
+      "epoch": 0.296014800740037,
+      "grad_norm": 0.9636649489402771,
+      "learning_rate": 3.520704140828166e-05,
+      "loss": 2.0415,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3000150007500375,
+      "grad_norm": 0.9389702677726746,
+      "learning_rate": 3.500700140028006e-05,
+      "loss": 2.121,
+      "step": 1500
+    },
+    {
+      "epoch": 0.304015200760038,
+      "grad_norm": 0.639850914478302,
+      "learning_rate": 3.480696139227846e-05,
+      "loss": 2.0687,
+      "step": 1520
+    },
+    {
+      "epoch": 0.3080154007700385,
+      "grad_norm": 0.6090150475502014,
+      "learning_rate": 3.460692138427686e-05,
+      "loss": 2.0475,
+      "step": 1540
+    },
+    {
+      "epoch": 0.312015600780039,
+      "grad_norm": 0.9228300452232361,
+      "learning_rate": 3.4406881376275256e-05,
+      "loss": 1.9342,
+      "step": 1560
+    },
+    {
+      "epoch": 0.3160158007900395,
+      "grad_norm": 0.8331165909767151,
+      "learning_rate": 3.4206841368273654e-05,
+      "loss": 2.0184,
+      "step": 1580
+    },
+    {
+      "epoch": 0.32001600080004,
+      "grad_norm": 0.8827919363975525,
+      "learning_rate": 3.400680136027206e-05,
+      "loss": 1.9501,
+      "step": 1600
+    },
+    {
+      "epoch": 0.3240162008100405,
+      "grad_norm": 0.6547022461891174,
+      "learning_rate": 3.3806761352270455e-05,
+      "loss": 2.0,
+      "step": 1620
+    },
+    {
+      "epoch": 0.328016400820041,
+      "grad_norm": 0.7408332228660583,
+      "learning_rate": 3.360672134426885e-05,
+      "loss": 2.0224,
+      "step": 1640
+    },
+    {
+      "epoch": 0.3320166008300415,
+      "grad_norm": 1.0483753681182861,
+      "learning_rate": 3.3406681336267257e-05,
+      "loss": 1.9768,
+      "step": 1660
+    },
+    {
+      "epoch": 0.336016800840042,
+      "grad_norm": 0.8425695300102234,
+      "learning_rate": 3.3206641328265654e-05,
+      "loss": 1.9681,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3400170008500425,
+      "grad_norm": 0.8190167546272278,
+      "learning_rate": 3.300660132026405e-05,
+      "loss": 2.0136,
+      "step": 1700
+    },
+    {
+      "epoch": 0.344017200860043,
+      "grad_norm": 0.8833659887313843,
+      "learning_rate": 3.280656131226245e-05,
+      "loss": 2.0491,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3480174008700435,
+      "grad_norm": 0.7985687851905823,
+      "learning_rate": 3.260652130426086e-05,
+      "loss": 2.0795,
+      "step": 1740
+    },
+    {
+      "epoch": 0.352017600880044,
+      "grad_norm": 0.5869231224060059,
+      "learning_rate": 3.240648129625926e-05,
+      "loss": 1.9394,
+      "step": 1760
+    },
+    {
+      "epoch": 0.3560178008900445,
+      "grad_norm": 0.8566650152206421,
+      "learning_rate": 3.2206441288257654e-05,
+      "loss": 2.1057,
+      "step": 1780
+    },
+    {
+      "epoch": 0.360018000900045,
+      "grad_norm": 0.8803682327270508,
+      "learning_rate": 3.200640128025605e-05,
+      "loss": 2.0077,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3640182009100455,
+      "grad_norm": 0.8509514927864075,
+      "learning_rate": 3.1806361272254456e-05,
+      "loss": 2.0842,
+      "step": 1820
+    },
+    {
+      "epoch": 0.368018400920046,
+      "grad_norm": 0.7994621992111206,
+      "learning_rate": 3.160632126425285e-05,
+      "loss": 1.9552,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3720186009300465,
+      "grad_norm": 0.8077077269554138,
+      "learning_rate": 3.140628125625125e-05,
+      "loss": 1.951,
+      "step": 1860
+    },
+    {
+      "epoch": 0.376018800940047,
+      "grad_norm": 1.0797460079193115,
+      "learning_rate": 3.1206241248249655e-05,
+      "loss": 1.9852,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3800190009500475,
+      "grad_norm": 0.810925304889679,
+      "learning_rate": 3.100620124024805e-05,
+      "loss": 2.0933,
+      "step": 1900
+    },
+    {
+      "epoch": 0.384019200960048,
+      "grad_norm": 0.7675073742866516,
+      "learning_rate": 3.080616123224645e-05,
+      "loss": 2.148,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3880194009700485,
+      "grad_norm": 0.6319103837013245,
+      "learning_rate": 3.0606121224244847e-05,
+      "loss": 2.023,
+      "step": 1940
+    },
+    {
+      "epoch": 0.392019600980049,
+      "grad_norm": 0.9584619402885437,
+      "learning_rate": 3.0406081216243247e-05,
+      "loss": 2.0993,
+      "step": 1960
+    },
+    {
+      "epoch": 0.3960198009900495,
+      "grad_norm": 0.6930970549583435,
+      "learning_rate": 3.0206041208241648e-05,
+      "loss": 2.1479,
+      "step": 1980
+    },
+    {
+      "epoch": 0.40002000100005003,
+      "grad_norm": 0.8483073711395264,
+      "learning_rate": 3.0006001200240045e-05,
+      "loss": 2.0586,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4040202010100505,
+      "grad_norm": 0.7375994324684143,
+      "learning_rate": 2.9805961192238453e-05,
+      "loss": 2.0511,
+      "step": 2020
+    },
+    {
+      "epoch": 0.408020401020051,
+      "grad_norm": 0.5990826487541199,
+      "learning_rate": 2.960592118423685e-05,
+      "loss": 2.0275,
+      "step": 2040
+    },
+    {
+      "epoch": 0.4120206010300515,
+      "grad_norm": 0.7214673757553101,
+      "learning_rate": 2.940588117623525e-05,
+      "loss": 2.0103,
+      "step": 2060
+    },
+    {
+      "epoch": 0.416020801040052,
+      "grad_norm": 0.8104730248451233,
+      "learning_rate": 2.920584116823365e-05,
+      "loss": 2.057,
+      "step": 2080
+    },
+    {
+      "epoch": 0.4200210010500525,
+      "grad_norm": 0.8133126497268677,
+      "learning_rate": 2.900580116023205e-05,
+      "loss": 1.961,
+      "step": 2100
+    },
+    {
+      "epoch": 0.424021201060053,
+      "grad_norm": 0.9787519574165344,
+      "learning_rate": 2.880576115223045e-05,
+      "loss": 1.9955,
+      "step": 2120
+    },
+    {
+      "epoch": 0.4280214010700535,
+      "grad_norm": 0.7598003149032593,
+      "learning_rate": 2.8605721144228847e-05,
+      "loss": 1.9718,
+      "step": 2140
+    },
+    {
+      "epoch": 0.432021601080054,
+      "grad_norm": 0.8683035373687744,
+      "learning_rate": 2.8405681136227248e-05,
+      "loss": 2.1147,
+      "step": 2160
+    },
+    {
+      "epoch": 0.4360218010900545,
+      "grad_norm": 0.6405585408210754,
+      "learning_rate": 2.8205641128225645e-05,
+      "loss": 1.9286,
+      "step": 2180
+    },
+    {
+      "epoch": 0.440022001100055,
+      "grad_norm": 0.7660755515098572,
+      "learning_rate": 2.8005601120224046e-05,
+      "loss": 2.0977,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4440222011100555,
+      "grad_norm": 0.8197988271713257,
+      "learning_rate": 2.7805561112222443e-05,
+      "loss": 2.0483,
+      "step": 2220
+    },
+    {
+      "epoch": 0.448022401120056,
+      "grad_norm": 0.794974148273468,
+      "learning_rate": 2.7605521104220844e-05,
+      "loss": 2.0287,
+      "step": 2240
+    },
+    {
+      "epoch": 0.4520226011300565,
+      "grad_norm": 0.8480708003044128,
+      "learning_rate": 2.7405481096219245e-05,
+      "loss": 2.1535,
+      "step": 2260
+    },
+    {
+      "epoch": 0.456022801140057,
+      "grad_norm": 0.7189005613327026,
+      "learning_rate": 2.7205441088217642e-05,
+      "loss": 2.0849,
+      "step": 2280
+    },
+    {
+      "epoch": 0.4600230011500575,
+      "grad_norm": 0.684190571308136,
+      "learning_rate": 2.7005401080216043e-05,
+      "loss": 2.0639,
+      "step": 2300
+    },
+    {
+      "epoch": 0.464023201160058,
+      "grad_norm": 0.6512330770492554,
+      "learning_rate": 2.6805361072214447e-05,
+      "loss": 2.0349,
+      "step": 2320
+    },
+    {
+      "epoch": 0.46802340117005853,
+      "grad_norm": 0.6493174433708191,
+      "learning_rate": 2.6605321064212844e-05,
+      "loss": 2.0365,
+      "step": 2340
+    },
+    {
+      "epoch": 0.472023601180059,
+      "grad_norm": 0.6828829646110535,
+      "learning_rate": 2.6405281056211245e-05,
+      "loss": 1.9618,
+      "step": 2360
+    },
+    {
+      "epoch": 0.4760238011900595,
+      "grad_norm": 0.8187217712402344,
+      "learning_rate": 2.6205241048209646e-05,
+      "loss": 1.9968,
+      "step": 2380
+    },
+    {
+      "epoch": 0.48002400120006,
+      "grad_norm": 0.8556678295135498,
+      "learning_rate": 2.6005201040208043e-05,
+      "loss": 1.9587,
+      "step": 2400
+    },
+    {
+      "epoch": 0.4840242012100605,
+      "grad_norm": 0.7501711249351501,
+      "learning_rate": 2.5805161032206444e-05,
+      "loss": 1.9828,
+      "step": 2420
+    },
+    {
+      "epoch": 0.488024401220061,
+      "grad_norm": 0.8156343102455139,
+      "learning_rate": 2.560512102420484e-05,
+      "loss": 2.1709,
+      "step": 2440
+    },
+    {
+      "epoch": 0.4920246012300615,
+      "grad_norm": 0.7879245281219482,
+      "learning_rate": 2.5405081016203242e-05,
+      "loss": 1.9537,
+      "step": 2460
+    },
+    {
+      "epoch": 0.496024801240062,
+      "grad_norm": 0.8421228528022766,
+      "learning_rate": 2.520504100820164e-05,
+      "loss": 1.9807,
+      "step": 2480
+    },
+    {
+      "epoch": 0.5000250012500626,
+      "grad_norm": 0.7792238593101501,
+      "learning_rate": 2.500500100020004e-05,
+      "loss": 2.0446,
+      "step": 2500
+    },
+    {
+      "epoch": 0.504025201260063,
+      "grad_norm": 0.9213767051696777,
+      "learning_rate": 2.480496099219844e-05,
+      "loss": 2.0424,
+      "step": 2520
+    },
+    {
+      "epoch": 0.5080254012700635,
+      "grad_norm": 0.8006434440612793,
+      "learning_rate": 2.460492098419684e-05,
+      "loss": 2.0331,
+      "step": 2540
+    },
+    {
+      "epoch": 0.512025601280064,
+      "grad_norm": 0.5892564654350281,
+      "learning_rate": 2.4404880976195242e-05,
+      "loss": 1.9761,
+      "step": 2560
+    },
+    {
+      "epoch": 0.5160258012900645,
+      "grad_norm": 0.6530196070671082,
+      "learning_rate": 2.420484096819364e-05,
+      "loss": 2.1553,
+      "step": 2580
+    },
+    {
+      "epoch": 0.520026001300065,
+      "grad_norm": 0.6981936097145081,
+      "learning_rate": 2.400480096019204e-05,
+      "loss": 2.0055,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5240262013100655,
+      "grad_norm": 0.7920117378234863,
+      "learning_rate": 2.3804760952190438e-05,
+      "loss": 2.0223,
+      "step": 2620
+    },
+    {
+      "epoch": 0.528026401320066,
+      "grad_norm": 0.9595440626144409,
+      "learning_rate": 2.360472094418884e-05,
+      "loss": 2.0037,
+      "step": 2640
+    },
+    {
+      "epoch": 0.5320266013300665,
+      "grad_norm": 0.8818207383155823,
+      "learning_rate": 2.340468093618724e-05,
+      "loss": 2.0622,
+      "step": 2660
+    },
+    {
+      "epoch": 0.536026801340067,
+      "grad_norm": 0.8845741748809814,
+      "learning_rate": 2.320464092818564e-05,
+      "loss": 2.0308,
+      "step": 2680
+    },
+    {
+      "epoch": 0.5400270013500675,
+      "grad_norm": 0.936955451965332,
+      "learning_rate": 2.3004600920184037e-05,
+      "loss": 2.0451,
+      "step": 2700
+    },
+    {
+      "epoch": 0.544027201360068,
+      "grad_norm": 0.7245746850967407,
+      "learning_rate": 2.2804560912182438e-05,
+      "loss": 2.0061,
+      "step": 2720
+    },
+    {
+      "epoch": 0.5480274013700686,
+      "grad_norm": 0.6587778329849243,
+      "learning_rate": 2.2604520904180835e-05,
+      "loss": 2.0694,
+      "step": 2740
+    },
+    {
+      "epoch": 0.552027601380069,
+      "grad_norm": 0.7373849749565125,
+      "learning_rate": 2.2404480896179236e-05,
+      "loss": 1.9313,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5560278013900695,
+      "grad_norm": 0.724927544593811,
+      "learning_rate": 2.2204440888177637e-05,
+      "loss": 1.9467,
+      "step": 2780
+    },
+    {
+      "epoch": 0.56002800140007,
+      "grad_norm": 1.0421233177185059,
+      "learning_rate": 2.2004400880176038e-05,
+      "loss": 1.9116,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5640282014100705,
+      "grad_norm": 1.2610092163085938,
+      "learning_rate": 2.180436087217444e-05,
+      "loss": 1.8987,
+      "step": 2820
+    },
+    {
+      "epoch": 0.568028401420071,
+      "grad_norm": 1.0684915781021118,
+      "learning_rate": 2.1604320864172836e-05,
+      "loss": 1.9541,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5720286014300715,
+      "grad_norm": 0.7997428178787231,
+      "learning_rate": 2.1404280856171236e-05,
+      "loss": 1.927,
+      "step": 2860
+    },
+    {
+      "epoch": 0.576028801440072,
+      "grad_norm": 0.829437792301178,
+      "learning_rate": 2.1204240848169634e-05,
+      "loss": 2.0281,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5800290014500725,
+      "grad_norm": 0.8651343584060669,
+      "learning_rate": 2.1004200840168034e-05,
+      "loss": 2.1478,
+      "step": 2900
+    },
+    {
+      "epoch": 0.584029201460073,
+      "grad_norm": 0.7608447670936584,
+      "learning_rate": 2.0804160832166432e-05,
+      "loss": 1.9435,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5880294014700735,
+      "grad_norm": 0.7536948323249817,
+      "learning_rate": 2.0604120824164833e-05,
+      "loss": 1.9884,
+      "step": 2940
+    },
+    {
+      "epoch": 0.592029601480074,
+      "grad_norm": 1.0718867778778076,
+      "learning_rate": 2.0404080816163233e-05,
+      "loss": 2.0581,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5960298014900745,
+      "grad_norm": 0.8238010406494141,
+      "learning_rate": 2.0204040808161634e-05,
+      "loss": 1.9946,
+      "step": 2980
+    },
+    {
+      "epoch": 0.600030001500075,
+      "grad_norm": 0.7035810947418213,
+      "learning_rate": 2.0004000800160035e-05,
+      "loss": 2.033,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6040302015100755,
+      "grad_norm": 1.0067816972732544,
+      "learning_rate": 1.9803960792158432e-05,
+      "loss": 2.0053,
+      "step": 3020
+    },
+    {
+      "epoch": 0.608030401520076,
+      "grad_norm": 0.8643906116485596,
+      "learning_rate": 1.9603920784156833e-05,
+      "loss": 2.0026,
+      "step": 3040
+    },
+    {
+      "epoch": 0.6120306015300765,
+      "grad_norm": 1.065471887588501,
+      "learning_rate": 1.940388077615523e-05,
+      "loss": 2.1586,
+      "step": 3060
+    },
+    {
+      "epoch": 0.616030801540077,
+      "grad_norm": 0.6762551069259644,
+      "learning_rate": 1.920384076815363e-05,
+      "loss": 1.9822,
+      "step": 3080
+    },
+    {
+      "epoch": 0.6200310015500775,
+      "grad_norm": 1.3541420698165894,
+      "learning_rate": 1.900380076015203e-05,
+      "loss": 2.0724,
+      "step": 3100
+    },
+    {
+      "epoch": 0.624031201560078,
+      "grad_norm": 0.6772061586380005,
+      "learning_rate": 1.8803760752150432e-05,
+      "loss": 2.0198,
+      "step": 3120
+    },
+    {
+      "epoch": 0.6280314015700785,
+      "grad_norm": 0.753608226776123,
+      "learning_rate": 1.860372074414883e-05,
+      "loss": 1.9967,
+      "step": 3140
+    },
+    {
+      "epoch": 0.632031601580079,
+      "grad_norm": 0.721973717212677,
+      "learning_rate": 1.840368073614723e-05,
+      "loss": 2.0109,
+      "step": 3160
+    },
+    {
+      "epoch": 0.6360318015900795,
+      "grad_norm": 0.7662345767021179,
+      "learning_rate": 1.8203640728145628e-05,
+      "loss": 2.1131,
+      "step": 3180
+    },
+    {
+      "epoch": 0.64003200160008,
+      "grad_norm": 0.7523438334465027,
+      "learning_rate": 1.800360072014403e-05,
+      "loss": 1.8991,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6440322016100805,
+      "grad_norm": 0.8926076889038086,
+      "learning_rate": 1.780356071214243e-05,
+      "loss": 1.8955,
+      "step": 3220
+    },
+    {
+      "epoch": 0.648032401620081,
+      "grad_norm": 0.8007706999778748,
+      "learning_rate": 1.760352070414083e-05,
+      "loss": 1.9697,
+      "step": 3240
+    },
+    {
+      "epoch": 0.6520326016300815,
+      "grad_norm": 0.9090889692306519,
+      "learning_rate": 1.740348069613923e-05,
+      "loss": 1.9129,
+      "step": 3260
+    },
+    {
+      "epoch": 0.656032801640082,
+      "grad_norm": 0.815657377243042,
+      "learning_rate": 1.7203440688137628e-05,
+      "loss": 1.8882,
+      "step": 3280
+    },
+    {
+      "epoch": 0.6600330016500825,
+      "grad_norm": 0.7561853528022766,
+      "learning_rate": 1.700340068013603e-05,
+      "loss": 2.0443,
+      "step": 3300
+    },
+    {
+      "epoch": 0.664033201660083,
+      "grad_norm": 0.6872078776359558,
+      "learning_rate": 1.6813362672534507e-05,
+      "loss": 2.0211,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6680334016700835,
+      "grad_norm": 0.70289146900177,
+      "learning_rate": 1.6613322664532908e-05,
+      "loss": 1.9342,
+      "step": 3340
+    },
+    {
+      "epoch": 0.672033601680084,
+      "grad_norm": 0.7171549201011658,
+      "learning_rate": 1.6413282656531305e-05,
+      "loss": 2.0781,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6760338016900845,
+      "grad_norm": 1.1851857900619507,
+      "learning_rate": 1.6213242648529706e-05,
+      "loss": 1.9866,
+      "step": 3380
+    },
+    {
+      "epoch": 0.680034001700085,
+      "grad_norm": 0.7133464217185974,
+      "learning_rate": 1.6013202640528107e-05,
+      "loss": 1.9788,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6840342017100856,
+      "grad_norm": 0.8273634910583496,
+      "learning_rate": 1.5813162632526508e-05,
+      "loss": 2.0872,
+      "step": 3420
+    },
+    {
+      "epoch": 0.688034401720086,
+      "grad_norm": 0.6994153261184692,
+      "learning_rate": 1.5613122624524905e-05,
+      "loss": 2.0161,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6920346017300865,
+      "grad_norm": 0.6677341461181641,
+      "learning_rate": 1.5413082616523306e-05,
+      "loss": 1.8846,
+      "step": 3460
+    },
+    {
+      "epoch": 0.696034801740087,
+      "grad_norm": 0.7772448062896729,
+      "learning_rate": 1.5213042608521705e-05,
+      "loss": 1.9809,
+      "step": 3480
+    },
+    {
+      "epoch": 0.7000350017500875,
+      "grad_norm": 0.7587478160858154,
+      "learning_rate": 1.5013002600520104e-05,
+      "loss": 1.9299,
+      "step": 3500
+    },
+    {
+      "epoch": 0.704035201760088,
+      "grad_norm": 0.625851035118103,
+      "learning_rate": 1.4812962592518503e-05,
+      "loss": 1.9841,
+      "step": 3520
+    },
+    {
+      "epoch": 0.7080354017700885,
+      "grad_norm": 0.7695503234863281,
+      "learning_rate": 1.4612922584516905e-05,
+      "loss": 1.9903,
+      "step": 3540
+    },
+    {
+      "epoch": 0.712035601780089,
+      "grad_norm": 1.4897569417953491,
+      "learning_rate": 1.4412882576515304e-05,
+      "loss": 2.1432,
+      "step": 3560
+    },
+    {
+      "epoch": 0.7160358017900895,
+      "grad_norm": 1.0326311588287354,
+      "learning_rate": 1.4212842568513703e-05,
+      "loss": 1.9147,
+      "step": 3580
+    },
+    {
+      "epoch": 0.72003600180009,
+      "grad_norm": 0.7355366349220276,
+      "learning_rate": 1.4012802560512102e-05,
+      "loss": 2.0576,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7240362018100905,
+      "grad_norm": 0.6938571929931641,
+      "learning_rate": 1.3812762552510503e-05,
+      "loss": 2.054,
+      "step": 3620
+    },
+    {
+      "epoch": 0.728036401820091,
+      "grad_norm": 0.6852160096168518,
+      "learning_rate": 1.3612722544508902e-05,
+      "loss": 1.9193,
+      "step": 3640
+    },
+    {
+      "epoch": 0.7320366018300916,
+      "grad_norm": 0.7561490535736084,
+      "learning_rate": 1.3412682536507301e-05,
+      "loss": 2.0894,
+      "step": 3660
+    },
+    {
+      "epoch": 0.736036801840092,
+      "grad_norm": 0.6602606773376465,
+      "learning_rate": 1.32126425285057e-05,
+      "loss": 1.9736,
+      "step": 3680
+    },
+    {
+      "epoch": 0.7400370018500925,
+      "grad_norm": 0.7436513900756836,
+      "learning_rate": 1.3012602520504103e-05,
+      "loss": 1.9555,
+      "step": 3700
+    },
+    {
+      "epoch": 0.744037201860093,
+      "grad_norm": 1.3854832649230957,
+      "learning_rate": 1.2812562512502502e-05,
+      "loss": 2.0192,
+      "step": 3720
+    },
+    {
+      "epoch": 0.7480374018700935,
+      "grad_norm": 0.6004545092582703,
+      "learning_rate": 1.26125225045009e-05,
+      "loss": 1.983,
+      "step": 3740
+    },
+    {
+      "epoch": 0.752037601880094,
+      "grad_norm": 0.7762808203697205,
+      "learning_rate": 1.24124824964993e-05,
+      "loss": 2.0417,
+      "step": 3760
+    },
+    {
+      "epoch": 0.7560378018900945,
+      "grad_norm": 0.8214603662490845,
+      "learning_rate": 1.2212442488497699e-05,
+      "loss": 1.9654,
+      "step": 3780
+    },
+    {
+      "epoch": 0.760038001900095,
+      "grad_norm": 0.5759713649749756,
+      "learning_rate": 1.20124024804961e-05,
+      "loss": 1.9993,
+      "step": 3800
+    },
+    {
+      "epoch": 0.7640382019100955,
+      "grad_norm": 0.8434195518493652,
+      "learning_rate": 1.1812362472494499e-05,
+      "loss": 2.136,
+      "step": 3820
+    },
+    {
+      "epoch": 0.768038401920096,
+      "grad_norm": 0.8448805212974548,
+      "learning_rate": 1.16123224644929e-05,
+      "loss": 2.0152,
+      "step": 3840
+    },
+    {
+      "epoch": 0.7720386019300965,
+      "grad_norm": 0.6562586426734924,
+      "learning_rate": 1.14122824564913e-05,
+      "loss": 1.8749,
+      "step": 3860
+    },
+    {
+      "epoch": 0.776038801940097,
+      "grad_norm": 1.0431329011917114,
+      "learning_rate": 1.12122424484897e-05,
+      "loss": 1.9336,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7800390019500975,
+      "grad_norm": 0.6854636669158936,
+      "learning_rate": 1.1012202440488098e-05,
+      "loss": 2.0081,
+      "step": 3900
+    },
+    {
+      "epoch": 0.784039201960098,
+      "grad_norm": 0.8114478588104248,
+      "learning_rate": 1.0812162432486497e-05,
+      "loss": 2.1009,
+      "step": 3920
+    },
+    {
+      "epoch": 0.7880394019700985,
+      "grad_norm": 0.8008230924606323,
+      "learning_rate": 1.0612122424484898e-05,
+      "loss": 2.004,
+      "step": 3940
+    },
+    {
+      "epoch": 0.792039601980099,
+      "grad_norm": 0.9538334012031555,
+      "learning_rate": 1.0412082416483297e-05,
+      "loss": 1.9979,
+      "step": 3960
+    },
+    {
+      "epoch": 0.7960398019900995,
+      "grad_norm": 0.7155557870864868,
+      "learning_rate": 1.0212042408481696e-05,
+      "loss": 1.9874,
+      "step": 3980
+    },
+    {
+      "epoch": 0.8000400020001001,
+      "grad_norm": 0.8598875403404236,
+      "learning_rate": 1.0012002400480097e-05,
+      "loss": 2.0835,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8040402020101005,
+      "grad_norm": 0.7795354723930359,
+      "learning_rate": 9.811962392478496e-06,
+      "loss": 2.0461,
+      "step": 4020
+    },
+    {
+      "epoch": 0.808040402020101,
+      "grad_norm": 0.9291620254516602,
+      "learning_rate": 9.611922384476895e-06,
+      "loss": 1.9119,
+      "step": 4040
+    },
+    {
+      "epoch": 0.8120406020301015,
+      "grad_norm": 1.3040659427642822,
+      "learning_rate": 9.411882376475296e-06,
+      "loss": 2.0027,
+      "step": 4060
+    },
+    {
+      "epoch": 0.816040802040102,
+      "grad_norm": 0.9256519079208374,
+      "learning_rate": 9.211842368473696e-06,
+      "loss": 2.0155,
+      "step": 4080
+    },
+    {
+      "epoch": 0.8200410020501026,
+      "grad_norm": 0.8110265731811523,
+      "learning_rate": 9.011802360472095e-06,
+      "loss": 2.0509,
+      "step": 4100
+    },
+    {
+      "epoch": 0.824041202060103,
+      "grad_norm": 0.8365888595581055,
+      "learning_rate": 8.811762352470494e-06,
+      "loss": 2.0216,
+      "step": 4120
+    },
+    {
+      "epoch": 0.8280414020701035,
+      "grad_norm": 0.7588245868682861,
+      "learning_rate": 8.611722344468894e-06,
+      "loss": 2.068,
+      "step": 4140
+    },
+    {
+      "epoch": 0.832041602080104,
+      "grad_norm": 0.7555562853813171,
+      "learning_rate": 8.411682336467294e-06,
+      "loss": 2.0204,
+      "step": 4160
+    },
+    {
+      "epoch": 0.8360418020901045,
+      "grad_norm": 0.7847601771354675,
+      "learning_rate": 8.211642328465693e-06,
+      "loss": 2.0697,
+      "step": 4180
+    },
+    {
+      "epoch": 0.840042002100105,
+      "grad_norm": 0.739717960357666,
+      "learning_rate": 8.011602320464092e-06,
+      "loss": 2.0007,
+      "step": 4200
+    },
+    {
+      "epoch": 0.8440422021101055,
+      "grad_norm": 0.7168034911155701,
+      "learning_rate": 7.811562312462493e-06,
+      "loss": 2.0782,
+      "step": 4220
+    },
+    {
+      "epoch": 0.848042402120106,
+      "grad_norm": 0.9154016375541687,
+      "learning_rate": 7.611522304460893e-06,
+      "loss": 2.1683,
+      "step": 4240
+    },
+    {
+      "epoch": 0.8520426021301065,
+      "grad_norm": 0.7498438954353333,
+      "learning_rate": 7.411482296459292e-06,
+      "loss": 2.0312,
+      "step": 4260
+    },
+    {
+      "epoch": 0.856042802140107,
+      "grad_norm": 0.7634301781654358,
+      "learning_rate": 7.211442288457691e-06,
+      "loss": 2.0329,
+      "step": 4280
+    },
+    {
+      "epoch": 0.8600430021501075,
+      "grad_norm": 0.7307527661323547,
+      "learning_rate": 7.011402280456092e-06,
+      "loss": 1.9397,
+      "step": 4300
+    },
+    {
+      "epoch": 0.864043202160108,
+      "grad_norm": 0.8496876955032349,
+      "learning_rate": 6.811362272454491e-06,
+      "loss": 1.9961,
+      "step": 4320
+    },
+    {
+      "epoch": 0.8680434021701086,
+      "grad_norm": 0.655273973941803,
+      "learning_rate": 6.611322264452891e-06,
+      "loss": 1.9959,
+      "step": 4340
+    },
+    {
+      "epoch": 0.872043602180109,
+      "grad_norm": 0.9990720152854919,
+      "learning_rate": 6.4112822564512915e-06,
+      "loss": 2.0527,
+      "step": 4360
+    },
+    {
+      "epoch": 0.8760438021901095,
+      "grad_norm": 0.7674184441566467,
+      "learning_rate": 6.2112422484496905e-06,
+      "loss": 1.8886,
+      "step": 4380
+    },
+    {
+      "epoch": 0.88004400220011,
+      "grad_norm": 0.6465392112731934,
+      "learning_rate": 6.0112022404480895e-06,
+      "loss": 2.0483,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8840442022101105,
+      "grad_norm": 0.7932650446891785,
+      "learning_rate": 5.8111622324464894e-06,
+      "loss": 1.9573,
+      "step": 4420
+    },
+    {
+      "epoch": 0.888044402220111,
+      "grad_norm": 0.9597405195236206,
+      "learning_rate": 5.611122224444889e-06,
+      "loss": 2.0108,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8920446022301115,
+      "grad_norm": 0.9801364541053772,
+      "learning_rate": 5.411082216443289e-06,
+      "loss": 2.094,
+      "step": 4460
+    },
+    {
+      "epoch": 0.896044802240112,
+      "grad_norm": 0.7602345943450928,
+      "learning_rate": 5.211042208441689e-06,
+      "loss": 2.0018,
+      "step": 4480
+    },
+    {
+      "epoch": 0.9000450022501125,
+      "grad_norm": 0.7290012836456299,
+      "learning_rate": 5.011002200440088e-06,
+      "loss": 2.0802,
+      "step": 4500
+    },
+    {
+      "epoch": 0.904045202260113,
+      "grad_norm": 0.7639509439468384,
+      "learning_rate": 4.810962192438488e-06,
+      "loss": 1.9796,
+      "step": 4520
+    },
+    {
+      "epoch": 0.9080454022701135,
+      "grad_norm": 0.8385800123214722,
+      "learning_rate": 4.610922184436887e-06,
+      "loss": 1.9641,
+      "step": 4540
+    },
+    {
+      "epoch": 0.912045602280114,
+      "grad_norm": 1.0409960746765137,
+      "learning_rate": 4.410882176435288e-06,
+      "loss": 2.0136,
+      "step": 4560
+    },
+    {
+      "epoch": 0.9160458022901145,
+      "grad_norm": 0.8607903122901917,
+      "learning_rate": 4.210842168433687e-06,
+      "loss": 2.0872,
+      "step": 4580
+    },
+    {
+      "epoch": 0.920046002300115,
+      "grad_norm": 0.8632460832595825,
+      "learning_rate": 4.010802160432087e-06,
+      "loss": 2.0332,
+      "step": 4600
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 4999,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.9585013825536e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-4600/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55cd6babe646b8732ea46972b956ce2974c86621f5e41ed48376c3798e1d5d2c
+size 5048

checkpoint-4800/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: microsoft/phi-3-mini-4k-instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section covers use of the model after fine-tuning for a task, or when it is plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly. -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-4800/adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_up_proj",
+    "qkv_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-4800/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14ac4ee9c44161210d77729d5ab5db295a1b7acca75cb951170f9b9f30069311
+size 443717160

checkpoint-4800/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

checkpoint-4800/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8756c3d7890b46089228e275c111eb19e39c6f9fb81341fe1773aa4f11147a
+size 100878458

checkpoint-4800/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba5fd79c110a3f981b42d0688f49642f0de5769b44de43b8749256746c9550c3
+size 14244

checkpoint-4800/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17c50c81ceb7b90f69e36a70e48a516dfc655be9ca08f569c64c0ce53203c046
+size 1064

checkpoint-4800/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-4800/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-4800/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-4800/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,130 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-4800/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1713 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.96004800240012,
+  "eval_steps": 500,
+  "global_step": 4800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0040002000100005,
+      "grad_norm": 0.8278714418411255,
+      "learning_rate": 4.980996199239848e-05,
+      "loss": 2.8118,
+      "step": 20
+    },
+    {
+      "epoch": 0.008000400020001,
+      "grad_norm": 1.106671690940857,
+      "learning_rate": 4.960992198439688e-05,
+      "loss": 2.3191,
+      "step": 40
+    },
+    {
+      "epoch": 0.0120006000300015,
+      "grad_norm": 0.8004185557365417,
+      "learning_rate": 4.9409881976395286e-05,
+      "loss": 2.0999,
+      "step": 60
+    },
+    {
+      "epoch": 0.016000800040002,
+      "grad_norm": 0.8842968344688416,
+      "learning_rate": 4.920984196839368e-05,
+      "loss": 2.0641,
+      "step": 80
+    },
+    {
+      "epoch": 0.0200010000500025,
+      "grad_norm": 0.7633001208305359,
+      "learning_rate": 4.900980196039208e-05,
+      "loss": 2.122,
+      "step": 100
+    },
+    {
+      "epoch": 0.024001200060003,
+      "grad_norm": 1.039692759513855,
+      "learning_rate": 4.8809761952390485e-05,
+      "loss": 1.9996,
+      "step": 120
+    },
+    {
+      "epoch": 0.0280014000700035,
+      "grad_norm": 0.6684374809265137,
+      "learning_rate": 4.860972194438888e-05,
+      "loss": 2.1058,
+      "step": 140
+    },
+    {
+      "epoch": 0.032001600080004,
+      "grad_norm": 0.6902151107788086,
+      "learning_rate": 4.840968193638728e-05,
+      "loss": 2.1403,
+      "step": 160
+    },
+    {
+      "epoch": 0.0360018000900045,
+      "grad_norm": 0.6190841794013977,
+      "learning_rate": 4.820964192838568e-05,
+      "loss": 2.0701,
+      "step": 180
+    },
+    {
+      "epoch": 0.040002000100005,
+      "grad_norm": 0.6660407781600952,
+      "learning_rate": 4.800960192038408e-05,
+      "loss": 2.1249,
+      "step": 200
+    },
+    {
+      "epoch": 0.0440022001100055,
+      "grad_norm": 0.8184217214584351,
+      "learning_rate": 4.780956191238248e-05,
+      "loss": 1.9919,
+      "step": 220
+    },
+    {
+      "epoch": 0.048002400120006,
+      "grad_norm": 0.8200207352638245,
+      "learning_rate": 4.7609521904380875e-05,
+      "loss": 2.0187,
+      "step": 240
+    },
+    {
+      "epoch": 0.0520026001300065,
+      "grad_norm": 0.731918215751648,
+      "learning_rate": 4.740948189637928e-05,
+      "loss": 1.9556,
+      "step": 260
+    },
+    {
+      "epoch": 0.056002800140007,
+      "grad_norm": 0.839863121509552,
+      "learning_rate": 4.720944188837768e-05,
+      "loss": 2.0764,
+      "step": 280
+    },
+    {
+      "epoch": 0.0600030001500075,
+      "grad_norm": 0.6660728454589844,
+      "learning_rate": 4.7009401880376074e-05,
+      "loss": 1.9794,
+      "step": 300
+    },
+    {
+      "epoch": 0.064003200160008,
+      "grad_norm": 0.880646824836731,
+      "learning_rate": 4.680936187237448e-05,
+      "loss": 2.0677,
+      "step": 320
+    },
+    {
+      "epoch": 0.0680034001700085,
+      "grad_norm": 0.6575286984443665,
+      "learning_rate": 4.6609321864372876e-05,
+      "loss": 2.2138,
+      "step": 340
+    },
+    {
+      "epoch": 0.072003600180009,
+      "grad_norm": 0.7487124800682068,
+      "learning_rate": 4.640928185637128e-05,
+      "loss": 2.1787,
+      "step": 360
+    },
+    {
+      "epoch": 0.0760038001900095,
+      "grad_norm": 0.7617707252502441,
+      "learning_rate": 4.620924184836968e-05,
+      "loss": 2.0341,
+      "step": 380
+    },
+    {
+      "epoch": 0.08000400020001,
+      "grad_norm": 0.7258076667785645,
+      "learning_rate": 4.6009201840368075e-05,
+      "loss": 2.2021,
+      "step": 400
+    },
+    {
+      "epoch": 0.0840042002100105,
+      "grad_norm": 1.1093262434005737,
+      "learning_rate": 4.580916183236648e-05,
+      "loss": 2.1792,
+      "step": 420
+    },
+    {
+      "epoch": 0.088004400220011,
+      "grad_norm": 0.6948565244674683,
+      "learning_rate": 4.5609121824364876e-05,
+      "loss": 1.9883,
+      "step": 440
+    },
+    {
+      "epoch": 0.0920046002300115,
+      "grad_norm": 0.8213192820549011,
+      "learning_rate": 4.540908181636327e-05,
+      "loss": 2.08,
+      "step": 460
+    },
+    {
+      "epoch": 0.096004800240012,
+      "grad_norm": 0.6234931349754333,
+      "learning_rate": 4.520904180836167e-05,
+      "loss": 2.0775,
+      "step": 480
+    },
+    {
+      "epoch": 0.10000500025001251,
+      "grad_norm": 0.7921673059463501,
+      "learning_rate": 4.5009001800360075e-05,
+      "loss": 1.9584,
+      "step": 500
+    },
+    {
+      "epoch": 0.104005200260013,
+      "grad_norm": 0.5997710227966309,
+      "learning_rate": 4.480896179235847e-05,
+      "loss": 2.0239,
+      "step": 520
+    },
+    {
+      "epoch": 0.1080054002700135,
+      "grad_norm": 0.6408416032791138,
+      "learning_rate": 4.460892178435687e-05,
+      "loss": 2.0004,
+      "step": 540
+    },
+    {
+      "epoch": 0.112005600280014,
+      "grad_norm": 0.712670087814331,
+      "learning_rate": 4.4408881776355274e-05,
+      "loss": 2.1011,
+      "step": 560
+    },
+    {
+      "epoch": 0.1160058002900145,
+      "grad_norm": 0.8731632828712463,
+      "learning_rate": 4.420884176835367e-05,
+      "loss": 1.9921,
+      "step": 580
+    },
+    {
+      "epoch": 0.120006000300015,
+      "grad_norm": 0.7469409108161926,
+      "learning_rate": 4.4008801760352075e-05,
+      "loss": 1.9548,
+      "step": 600
+    },
+    {
+      "epoch": 0.1240062003100155,
+      "grad_norm": 0.9184134602546692,
+      "learning_rate": 4.380876175235047e-05,
+      "loss": 1.9798,
+      "step": 620
+    },
+    {
+      "epoch": 0.128006400320016,
+      "grad_norm": 0.6513165831565857,
+      "learning_rate": 4.360872174434888e-05,
+      "loss": 1.9918,
+      "step": 640
+    },
+    {
+      "epoch": 0.1320066003300165,
+      "grad_norm": 0.8042064905166626,
+      "learning_rate": 4.3408681736347274e-05,
+      "loss": 2.0268,
+      "step": 660
+    },
+    {
+      "epoch": 0.136006800340017,
+      "grad_norm": 0.6575964093208313,
+      "learning_rate": 4.320864172834567e-05,
+      "loss": 1.9958,
+      "step": 680
+    },
+    {
+      "epoch": 0.1400070003500175,
+      "grad_norm": 0.6663448214530945,
+      "learning_rate": 4.300860172034407e-05,
+      "loss": 2.0274,
+      "step": 700
+    },
+    {
+      "epoch": 0.144007200360018,
+      "grad_norm": 0.6880171298980713,
+      "learning_rate": 4.280856171234247e-05,
+      "loss": 1.9933,
+      "step": 720
+    },
+    {
+      "epoch": 0.1480074003700185,
+      "grad_norm": 1.1512036323547363,
+      "learning_rate": 4.260852170434087e-05,
+      "loss": 2.0047,
+      "step": 740
+    },
+    {
+      "epoch": 0.152007600380019,
+      "grad_norm": 0.9109539985656738,
+      "learning_rate": 4.240848169633927e-05,
+      "loss": 2.0879,
+      "step": 760
+    },
+    {
+      "epoch": 0.1560078003900195,
+      "grad_norm": 0.6704302430152893,
+      "learning_rate": 4.220844168833767e-05,
+      "loss": 2.0508,
+      "step": 780
+    },
+    {
+      "epoch": 0.16000800040002,
+      "grad_norm": 0.5899685621261597,
+      "learning_rate": 4.200840168033607e-05,
+      "loss": 2.1563,
+      "step": 800
+    },
+    {
+      "epoch": 0.1640082004100205,
+      "grad_norm": 0.8644593954086304,
+      "learning_rate": 4.1808361672334466e-05,
+      "loss": 1.9691,
+      "step": 820
+    },
+    {
+      "epoch": 0.168008400420021,
+      "grad_norm": 0.7722281813621521,
+      "learning_rate": 4.1608321664332864e-05,
+      "loss": 2.1047,
+      "step": 840
+    },
+    {
+      "epoch": 0.1720086004300215,
+      "grad_norm": 0.6952481269836426,
+      "learning_rate": 4.140828165633127e-05,
+      "loss": 2.0815,
+      "step": 860
+    },
+    {
+      "epoch": 0.176008800440022,
+      "grad_norm": 0.818111777305603,
+      "learning_rate": 4.1208241648329665e-05,
+      "loss": 2.0609,
+      "step": 880
+    },
+    {
+      "epoch": 0.1800090004500225,
+      "grad_norm": 0.7889490127563477,
+      "learning_rate": 4.100820164032807e-05,
+      "loss": 2.0989,
+      "step": 900
+    },
+    {
+      "epoch": 0.184009200460023,
+      "grad_norm": 0.9778928160667419,
+      "learning_rate": 4.0808161632326467e-05,
+      "loss": 2.0217,
+      "step": 920
+    },
+    {
+      "epoch": 0.1880094004700235,
+      "grad_norm": 0.6434693336486816,
+      "learning_rate": 4.060812162432487e-05,
+      "loss": 1.9841,
+      "step": 940
+    },
+    {
+      "epoch": 0.192009600480024,
+      "grad_norm": 0.8017128705978394,
+      "learning_rate": 4.040808161632327e-05,
+      "loss": 2.0542,
+      "step": 960
+    },
+    {
+      "epoch": 0.1960098004900245,
+      "grad_norm": 0.5969107747077942,
+      "learning_rate": 4.0208041608321665e-05,
+      "loss": 2.1491,
+      "step": 980
+    },
+    {
+      "epoch": 0.20001000050002501,
+      "grad_norm": 1.0352481603622437,
+      "learning_rate": 4.000800160032007e-05,
+      "loss": 2.0807,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2040102005100255,
+      "grad_norm": 0.6913074254989624,
+      "learning_rate": 3.980796159231847e-05,
+      "loss": 2.1653,
+      "step": 1020
+    },
+    {
+      "epoch": 0.208010400520026,
+      "grad_norm": 1.1356934309005737,
+      "learning_rate": 3.9607921584316864e-05,
+      "loss": 2.0834,
+      "step": 1040
+    },
+    {
+      "epoch": 0.2120106005300265,
+      "grad_norm": 0.7237222790718079,
+      "learning_rate": 3.940788157631526e-05,
+      "loss": 2.0639,
+      "step": 1060
+    },
+    {
+      "epoch": 0.216010800540027,
+      "grad_norm": 0.8379335403442383,
+      "learning_rate": 3.9207841568313666e-05,
+      "loss": 2.0899,
+      "step": 1080
+    },
+    {
+      "epoch": 0.2200110005500275,
+      "grad_norm": 0.8688188791275024,
+      "learning_rate": 3.900780156031206e-05,
+      "loss": 1.9249,
+      "step": 1100
+    },
+    {
+      "epoch": 0.224011200560028,
+      "grad_norm": 0.6886935234069824,
+      "learning_rate": 3.880776155231046e-05,
+      "loss": 2.0017,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2280114005700285,
+      "grad_norm": 0.6852912902832031,
+      "learning_rate": 3.8607721544308865e-05,
+      "loss": 1.9605,
+      "step": 1140
+    },
+    {
+      "epoch": 0.232011600580029,
+      "grad_norm": 1.002159833908081,
+      "learning_rate": 3.840768153630726e-05,
+      "loss": 2.0085,
+      "step": 1160
+    },
+    {
+      "epoch": 0.2360118005900295,
+      "grad_norm": 0.7874559164047241,
+      "learning_rate": 3.8207641528305666e-05,
+      "loss": 2.065,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24001200060003,
+      "grad_norm": 0.7146089673042297,
+      "learning_rate": 3.800760152030406e-05,
+      "loss": 2.0377,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2440122006100305,
+      "grad_norm": 0.7215453386306763,
+      "learning_rate": 3.780756151230246e-05,
+      "loss": 2.0733,
+      "step": 1220
+    },
+    {
+      "epoch": 0.248012400620031,
+      "grad_norm": 0.5989114046096802,
+      "learning_rate": 3.7607521504300865e-05,
+      "loss": 1.8203,
+      "step": 1240
+    },
+    {
+      "epoch": 0.2520126006300315,
+      "grad_norm": 0.854493260383606,
+      "learning_rate": 3.740748149629926e-05,
+      "loss": 2.1168,
+      "step": 1260
+    },
+    {
+      "epoch": 0.256012800640032,
+      "grad_norm": 0.724231481552124,
+      "learning_rate": 3.720744148829766e-05,
+      "loss": 2.1133,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2600130006500325,
+      "grad_norm": 0.6381165385246277,
+      "learning_rate": 3.7007401480296064e-05,
+      "loss": 2.0682,
+      "step": 1300
+    },
+    {
+      "epoch": 0.264013200660033,
+      "grad_norm": 0.6167752742767334,
+      "learning_rate": 3.680736147229446e-05,
+      "loss": 1.9757,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2680134006700335,
+      "grad_norm": 0.8661261796951294,
+      "learning_rate": 3.660732146429286e-05,
+      "loss": 2.0627,
+      "step": 1340
+    },
+    {
+      "epoch": 0.272013600680034,
+      "grad_norm": 0.7343857288360596,
+      "learning_rate": 3.6407281456291256e-05,
+      "loss": 2.0034,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2760138006900345,
+      "grad_norm": 0.622918426990509,
+      "learning_rate": 3.620724144828966e-05,
+      "loss": 2.0564,
+      "step": 1380
+    },
+    {
+      "epoch": 0.280014000700035,
+      "grad_norm": 0.8361110091209412,
+      "learning_rate": 3.600720144028806e-05,
+      "loss": 1.8507,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2840142007100355,
+      "grad_norm": 0.9155957698822021,
+      "learning_rate": 3.5807161432286455e-05,
+      "loss": 2.0867,
+      "step": 1420
+    },
+    {
+      "epoch": 0.288014400720036,
+      "grad_norm": 0.6283161640167236,
+      "learning_rate": 3.560712142428486e-05,
+      "loss": 1.9926,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2920146007300365,
+      "grad_norm": 0.7145543694496155,
+      "learning_rate": 3.540708141628326e-05,
+      "loss": 2.104,
+      "step": 1460
+    },
+    {
+      "epoch": 0.296014800740037,
+      "grad_norm": 0.9636649489402771,
+      "learning_rate": 3.520704140828166e-05,
+      "loss": 2.0415,
+      "step": 1480
+    },
+    {
+      "epoch": 0.3000150007500375,
+      "grad_norm": 0.9389702677726746,
+      "learning_rate": 3.500700140028006e-05,
+      "loss": 2.121,
+      "step": 1500
+    },
+    {
+      "epoch": 0.304015200760038,
+      "grad_norm": 0.639850914478302,
+      "learning_rate": 3.480696139227846e-05,
+      "loss": 2.0687,
+      "step": 1520
+    },
+    {
+      "epoch": 0.3080154007700385,
+      "grad_norm": 0.6090150475502014,
+      "learning_rate": 3.460692138427686e-05,
+      "loss": 2.0475,
+      "step": 1540
+    },
+    {
+      "epoch": 0.312015600780039,
+      "grad_norm": 0.9228300452232361,
+      "learning_rate": 3.4406881376275256e-05,
+      "loss": 1.9342,
+      "step": 1560
+    },
+    {
+      "epoch": 0.3160158007900395,
+      "grad_norm": 0.8331165909767151,
+      "learning_rate": 3.4206841368273654e-05,
+      "loss": 2.0184,
+      "step": 1580
+    },
+    {
+      "epoch": 0.32001600080004,
+      "grad_norm": 0.8827919363975525,
+      "learning_rate": 3.400680136027206e-05,
+      "loss": 1.9501,
+      "step": 1600
+    },
+    {
+      "epoch": 0.3240162008100405,
+      "grad_norm": 0.6547022461891174,
+      "learning_rate": 3.3806761352270455e-05,
+      "loss": 2.0,
+      "step": 1620
+    },
+    {
+      "epoch": 0.328016400820041,
+      "grad_norm": 0.7408332228660583,
+      "learning_rate": 3.360672134426885e-05,
+      "loss": 2.0224,
+      "step": 1640
+    },
+    {
+      "epoch": 0.3320166008300415,
+      "grad_norm": 1.0483753681182861,
+      "learning_rate": 3.3406681336267257e-05,
+      "loss": 1.9768,
+      "step": 1660
+    },
+    {
+      "epoch": 0.336016800840042,
+      "grad_norm": 0.8425695300102234,
+      "learning_rate": 3.3206641328265654e-05,
+      "loss": 1.9681,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3400170008500425,
+      "grad_norm": 0.8190167546272278,
+      "learning_rate": 3.300660132026405e-05,
+      "loss": 2.0136,
+      "step": 1700
+    },
+    {
+      "epoch": 0.344017200860043,
+      "grad_norm": 0.8833659887313843,
+      "learning_rate": 3.280656131226245e-05,
+      "loss": 2.0491,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3480174008700435,
+      "grad_norm": 0.7985687851905823,
+      "learning_rate": 3.260652130426086e-05,
+      "loss": 2.0795,
+      "step": 1740
+    },
+    {
+      "epoch": 0.352017600880044,
+      "grad_norm": 0.5869231224060059,
+      "learning_rate": 3.240648129625926e-05,
+      "loss": 1.9394,
+      "step": 1760
+    },
+    {
+      "epoch": 0.3560178008900445,
+      "grad_norm": 0.8566650152206421,
+      "learning_rate": 3.2206441288257654e-05,
+      "loss": 2.1057,
+      "step": 1780
+    },
+    {
+      "epoch": 0.360018000900045,
+      "grad_norm": 0.8803682327270508,
+      "learning_rate": 3.200640128025605e-05,
+      "loss": 2.0077,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3640182009100455,
+      "grad_norm": 0.8509514927864075,
+      "learning_rate": 3.1806361272254456e-05,
+      "loss": 2.0842,
+      "step": 1820
+    },
+    {
+      "epoch": 0.368018400920046,
+      "grad_norm": 0.7994621992111206,
+      "learning_rate": 3.160632126425285e-05,
+      "loss": 1.9552,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3720186009300465,
+      "grad_norm": 0.8077077269554138,
+      "learning_rate": 3.140628125625125e-05,
+      "loss": 1.951,
+      "step": 1860
+    },
+    {
+      "epoch": 0.376018800940047,
+      "grad_norm": 1.0797460079193115,
+      "learning_rate": 3.1206241248249655e-05,
+      "loss": 1.9852,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3800190009500475,
+      "grad_norm": 0.810925304889679,
+      "learning_rate": 3.100620124024805e-05,
+      "loss": 2.0933,
+      "step": 1900
+    },
+    {
+      "epoch": 0.384019200960048,
+      "grad_norm": 0.7675073742866516,
+      "learning_rate": 3.080616123224645e-05,
+      "loss": 2.148,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3880194009700485,
+      "grad_norm": 0.6319103837013245,
+      "learning_rate": 3.0606121224244847e-05,
+      "loss": 2.023,
+      "step": 1940
+    },
+    {
+      "epoch": 0.392019600980049,
+      "grad_norm": 0.9584619402885437,
+      "learning_rate": 3.0406081216243247e-05,
+      "loss": 2.0993,
+      "step": 1960
+    },
+    {
+      "epoch": 0.3960198009900495,
+      "grad_norm": 0.6930970549583435,
+      "learning_rate": 3.0206041208241648e-05,
+      "loss": 2.1479,
+      "step": 1980
+    },
+    {
+      "epoch": 0.40002000100005003,
+      "grad_norm": 0.8483073711395264,
+      "learning_rate": 3.0006001200240045e-05,
+      "loss": 2.0586,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4040202010100505,
+      "grad_norm": 0.7375994324684143,
+      "learning_rate": 2.9805961192238453e-05,
+      "loss": 2.0511,
+      "step": 2020
+    },
+    {
+      "epoch": 0.408020401020051,
+      "grad_norm": 0.5990826487541199,
+      "learning_rate": 2.960592118423685e-05,
+      "loss": 2.0275,
+      "step": 2040
+    },
+    {
+      "epoch": 0.4120206010300515,
+      "grad_norm": 0.7214673757553101,
+      "learning_rate": 2.940588117623525e-05,
+      "loss": 2.0103,
+      "step": 2060
+    },
+    {
+      "epoch": 0.416020801040052,
+      "grad_norm": 0.8104730248451233,
+      "learning_rate": 2.920584116823365e-05,
+      "loss": 2.057,
+      "step": 2080
+    },
+    {
+      "epoch": 0.4200210010500525,
+      "grad_norm": 0.8133126497268677,
+      "learning_rate": 2.900580116023205e-05,
+      "loss": 1.961,
+      "step": 2100
+    },
+    {
+      "epoch": 0.424021201060053,
+      "grad_norm": 0.9787519574165344,
+      "learning_rate": 2.880576115223045e-05,
+      "loss": 1.9955,
+      "step": 2120
+    },
+    {
+      "epoch": 0.4280214010700535,
+      "grad_norm": 0.7598003149032593,
+      "learning_rate": 2.8605721144228847e-05,
+      "loss": 1.9718,
+      "step": 2140
+    },
+    {
+      "epoch": 0.432021601080054,
+      "grad_norm": 0.8683035373687744,
+      "learning_rate": 2.8405681136227248e-05,
+      "loss": 2.1147,
+      "step": 2160
+    },
+    {
+      "epoch": 0.4360218010900545,
+      "grad_norm": 0.6405585408210754,
+      "learning_rate": 2.8205641128225645e-05,
+      "loss": 1.9286,
+      "step": 2180
+    },
+    {
+      "epoch": 0.440022001100055,
+      "grad_norm": 0.7660755515098572,
+      "learning_rate": 2.8005601120224046e-05,
+      "loss": 2.0977,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4440222011100555,
+      "grad_norm": 0.8197988271713257,
+      "learning_rate": 2.7805561112222443e-05,
+      "loss": 2.0483,
+      "step": 2220
+    },
+    {
+      "epoch": 0.448022401120056,
+      "grad_norm": 0.794974148273468,
+      "learning_rate": 2.7605521104220844e-05,
+      "loss": 2.0287,
+      "step": 2240
+    },
+    {
+      "epoch": 0.4520226011300565,
+      "grad_norm": 0.8480708003044128,
+      "learning_rate": 2.7405481096219245e-05,
+      "loss": 2.1535,
+      "step": 2260
+    },
+    {
+      "epoch": 0.456022801140057,
+      "grad_norm": 0.7189005613327026,
+      "learning_rate": 2.7205441088217642e-05,
+      "loss": 2.0849,
+      "step": 2280
+    },
+    {
+      "epoch": 0.4600230011500575,
+      "grad_norm": 0.684190571308136,
+      "learning_rate": 2.7005401080216043e-05,
+      "loss": 2.0639,
+      "step": 2300
+    },
+    {
+      "epoch": 0.464023201160058,
+      "grad_norm": 0.6512330770492554,
+      "learning_rate": 2.6805361072214447e-05,
+      "loss": 2.0349,
+      "step": 2320
+    },
+    {
+      "epoch": 0.46802340117005853,
+      "grad_norm": 0.6493174433708191,
+      "learning_rate": 2.6605321064212844e-05,
+      "loss": 2.0365,
+      "step": 2340
+    },
+    {
+      "epoch": 0.472023601180059,
+      "grad_norm": 0.6828829646110535,
+      "learning_rate": 2.6405281056211245e-05,
+      "loss": 1.9618,
+      "step": 2360
+    },
+    {
+      "epoch": 0.4760238011900595,
+      "grad_norm": 0.8187217712402344,
+      "learning_rate": 2.6205241048209646e-05,
+      "loss": 1.9968,
+      "step": 2380
+    },
+    {
+      "epoch": 0.48002400120006,
+      "grad_norm": 0.8556678295135498,
+      "learning_rate": 2.6005201040208043e-05,
+      "loss": 1.9587,
+      "step": 2400
+    },
+    {
+      "epoch": 0.4840242012100605,
+      "grad_norm": 0.7501711249351501,
+      "learning_rate": 2.5805161032206444e-05,
+      "loss": 1.9828,
+      "step": 2420
+    },
+    {
+      "epoch": 0.488024401220061,
+      "grad_norm": 0.8156343102455139,
+      "learning_rate": 2.560512102420484e-05,
+      "loss": 2.1709,
+      "step": 2440
+    },
+    {
+      "epoch": 0.4920246012300615,
+      "grad_norm": 0.7879245281219482,
+      "learning_rate": 2.5405081016203242e-05,
+      "loss": 1.9537,
+      "step": 2460
+    },
+    {
+      "epoch": 0.496024801240062,
+      "grad_norm": 0.8421228528022766,
+      "learning_rate": 2.520504100820164e-05,
+      "loss": 1.9807,
+      "step": 2480
+    },
+    {
+      "epoch": 0.5000250012500626,
+      "grad_norm": 0.7792238593101501,
+      "learning_rate": 2.500500100020004e-05,
+      "loss": 2.0446,
+      "step": 2500
+    },
+    {
+      "epoch": 0.504025201260063,
+      "grad_norm": 0.9213767051696777,
+      "learning_rate": 2.480496099219844e-05,
+      "loss": 2.0424,
+      "step": 2520
+    },
+    {
+      "epoch": 0.5080254012700635,
+      "grad_norm": 0.8006434440612793,
+      "learning_rate": 2.460492098419684e-05,
+      "loss": 2.0331,
+      "step": 2540
+    },
+    {
+      "epoch": 0.512025601280064,
+      "grad_norm": 0.5892564654350281,
+      "learning_rate": 2.4404880976195242e-05,
+      "loss": 1.9761,
+      "step": 2560
+    },
+    {
+      "epoch": 0.5160258012900645,
+      "grad_norm": 0.6530196070671082,
+      "learning_rate": 2.420484096819364e-05,
+      "loss": 2.1553,
+      "step": 2580
+    },
+    {
+      "epoch": 0.520026001300065,
+      "grad_norm": 0.6981936097145081,
+      "learning_rate": 2.400480096019204e-05,
+      "loss": 2.0055,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5240262013100655,
+      "grad_norm": 0.7920117378234863,
+      "learning_rate": 2.3804760952190438e-05,
+      "loss": 2.0223,
+      "step": 2620
+    },
+    {
+      "epoch": 0.528026401320066,
+      "grad_norm": 0.9595440626144409,
+      "learning_rate": 2.360472094418884e-05,
+      "loss": 2.0037,
+      "step": 2640
+    },
+    {
+      "epoch": 0.5320266013300665,
+      "grad_norm": 0.8818207383155823,
+      "learning_rate": 2.340468093618724e-05,
+      "loss": 2.0622,
+      "step": 2660
+    },
+    {
+      "epoch": 0.536026801340067,
+      "grad_norm": 0.8845741748809814,
+      "learning_rate": 2.320464092818564e-05,
+      "loss": 2.0308,
+      "step": 2680
+    },
+    {
+      "epoch": 0.5400270013500675,
+      "grad_norm": 0.936955451965332,
+      "learning_rate": 2.3004600920184037e-05,
+      "loss": 2.0451,
+      "step": 2700
+    },
+    {
+      "epoch": 0.544027201360068,
+      "grad_norm": 0.7245746850967407,
+      "learning_rate": 2.2804560912182438e-05,
+      "loss": 2.0061,
+      "step": 2720
+    },
+    {
+      "epoch": 0.5480274013700686,
+      "grad_norm": 0.6587778329849243,
+      "learning_rate": 2.2604520904180835e-05,
+      "loss": 2.0694,
+      "step": 2740
+    },
+    {
+      "epoch": 0.552027601380069,
+      "grad_norm": 0.7373849749565125,
+      "learning_rate": 2.2404480896179236e-05,
+      "loss": 1.9313,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5560278013900695,
+      "grad_norm": 0.724927544593811,
+      "learning_rate": 2.2204440888177637e-05,
+      "loss": 1.9467,
+      "step": 2780
+    },
+    {
+      "epoch": 0.56002800140007,
+      "grad_norm": 1.0421233177185059,
+      "learning_rate": 2.2004400880176038e-05,
+      "loss": 1.9116,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5640282014100705,
+      "grad_norm": 1.2610092163085938,
+      "learning_rate": 2.180436087217444e-05,
+      "loss": 1.8987,
+      "step": 2820
+    },
+    {
+      "epoch": 0.568028401420071,
+      "grad_norm": 1.0684915781021118,
+      "learning_rate": 2.1604320864172836e-05,
+      "loss": 1.9541,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5720286014300715,
+      "grad_norm": 0.7997428178787231,
+      "learning_rate": 2.1404280856171236e-05,
+      "loss": 1.927,
+      "step": 2860
+    },
+    {
+      "epoch": 0.576028801440072,
+      "grad_norm": 0.829437792301178,
+      "learning_rate": 2.1204240848169634e-05,
+      "loss": 2.0281,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5800290014500725,
+      "grad_norm": 0.8651343584060669,
+      "learning_rate": 2.1004200840168034e-05,
+      "loss": 2.1478,
+      "step": 2900
+    },
+    {
+      "epoch": 0.584029201460073,
+      "grad_norm": 0.7608447670936584,
+      "learning_rate": 2.0804160832166432e-05,
+      "loss": 1.9435,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5880294014700735,
+      "grad_norm": 0.7536948323249817,
+      "learning_rate": 2.0604120824164833e-05,
+      "loss": 1.9884,
+      "step": 2940
+    },
+    {
+      "epoch": 0.592029601480074,
+      "grad_norm": 1.0718867778778076,
+      "learning_rate": 2.0404080816163233e-05,
+      "loss": 2.0581,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5960298014900745,
+      "grad_norm": 0.8238010406494141,
+      "learning_rate": 2.0204040808161634e-05,
+      "loss": 1.9946,
+      "step": 2980
+    },
+    {
+      "epoch": 0.600030001500075,
+      "grad_norm": 0.7035810947418213,
+      "learning_rate": 2.0004000800160035e-05,
+      "loss": 2.033,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6040302015100755,
+      "grad_norm": 1.0067816972732544,
+      "learning_rate": 1.9803960792158432e-05,
+      "loss": 2.0053,
+      "step": 3020
+    },
+    {
+      "epoch": 0.608030401520076,
+      "grad_norm": 0.8643906116485596,
+      "learning_rate": 1.9603920784156833e-05,
+      "loss": 2.0026,
+      "step": 3040
+    },
+    {
+      "epoch": 0.6120306015300765,
+      "grad_norm": 1.065471887588501,
+      "learning_rate": 1.940388077615523e-05,
+      "loss": 2.1586,
+      "step": 3060
+    },
+    {
+      "epoch": 0.616030801540077,
+      "grad_norm": 0.6762551069259644,
+      "learning_rate": 1.920384076815363e-05,
+      "loss": 1.9822,
+      "step": 3080
+    },
+    {
+      "epoch": 0.6200310015500775,
+      "grad_norm": 1.3541420698165894,
+      "learning_rate": 1.900380076015203e-05,
+      "loss": 2.0724,
+      "step": 3100
+    },
+    {
+      "epoch": 0.624031201560078,
+      "grad_norm": 0.6772061586380005,
+      "learning_rate": 1.8803760752150432e-05,
+      "loss": 2.0198,
+      "step": 3120
+    },
+    {
+      "epoch": 0.6280314015700785,
+      "grad_norm": 0.753608226776123,
+      "learning_rate": 1.860372074414883e-05,
+      "loss": 1.9967,
+      "step": 3140
+    },
+    {
+      "epoch": 0.632031601580079,
+      "grad_norm": 0.721973717212677,
+      "learning_rate": 1.840368073614723e-05,
+      "loss": 2.0109,
+      "step": 3160
+    },
+    {
+      "epoch": 0.6360318015900795,
+      "grad_norm": 0.7662345767021179,
+      "learning_rate": 1.8203640728145628e-05,
+      "loss": 2.1131,
+      "step": 3180
+    },
+    {
+      "epoch": 0.64003200160008,
+      "grad_norm": 0.7523438334465027,
+      "learning_rate": 1.800360072014403e-05,
+      "loss": 1.8991,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6440322016100805,
+      "grad_norm": 0.8926076889038086,
+      "learning_rate": 1.780356071214243e-05,
+      "loss": 1.8955,
+      "step": 3220
+    },
+    {
+      "epoch": 0.648032401620081,
+      "grad_norm": 0.8007706999778748,
+      "learning_rate": 1.760352070414083e-05,
+      "loss": 1.9697,
+      "step": 3240
+    },
+    {
+      "epoch": 0.6520326016300815,
+      "grad_norm": 0.9090889692306519,
+      "learning_rate": 1.740348069613923e-05,
+      "loss": 1.9129,
+      "step": 3260
+    },
+    {
+      "epoch": 0.656032801640082,
+      "grad_norm": 0.815657377243042,
+      "learning_rate": 1.7203440688137628e-05,
+      "loss": 1.8882,
+      "step": 3280
+    },
+    {
+      "epoch": 0.6600330016500825,
+      "grad_norm": 0.7561853528022766,
+      "learning_rate": 1.700340068013603e-05,
+      "loss": 2.0443,
+      "step": 3300
+    },
+    {
+      "epoch": 0.664033201660083,
+      "grad_norm": 0.6872078776359558,
+      "learning_rate": 1.6813362672534507e-05,
+      "loss": 2.0211,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6680334016700835,
+      "grad_norm": 0.70289146900177,
+      "learning_rate": 1.6613322664532908e-05,
+      "loss": 1.9342,
+      "step": 3340
+    },
+    {
+      "epoch": 0.672033601680084,
+      "grad_norm": 0.7171549201011658,
+      "learning_rate": 1.6413282656531305e-05,
+      "loss": 2.0781,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6760338016900845,
+      "grad_norm": 1.1851857900619507,
+      "learning_rate": 1.6213242648529706e-05,
+      "loss": 1.9866,
+      "step": 3380
+    },
+    {
+      "epoch": 0.680034001700085,
+      "grad_norm": 0.7133464217185974,
+      "learning_rate": 1.6013202640528107e-05,
+      "loss": 1.9788,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6840342017100856,
+      "grad_norm": 0.8273634910583496,
+      "learning_rate": 1.5813162632526508e-05,
+      "loss": 2.0872,
+      "step": 3420
+    },
+    {
+      "epoch": 0.688034401720086,
+      "grad_norm": 0.6994153261184692,
+      "learning_rate": 1.5613122624524905e-05,
+      "loss": 2.0161,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6920346017300865,
+      "grad_norm": 0.6677341461181641,
+      "learning_rate": 1.5413082616523306e-05,
+      "loss": 1.8846,
+      "step": 3460
+    },
+    {
+      "epoch": 0.696034801740087,
+      "grad_norm": 0.7772448062896729,
+      "learning_rate": 1.5213042608521705e-05,
+      "loss": 1.9809,
+      "step": 3480
+    },
+    {
+      "epoch": 0.7000350017500875,
+      "grad_norm": 0.7587478160858154,
+      "learning_rate": 1.5013002600520104e-05,
+      "loss": 1.9299,
+      "step": 3500
+    },
+    {
+      "epoch": 0.704035201760088,
+      "grad_norm": 0.625851035118103,
+      "learning_rate": 1.4812962592518503e-05,
+      "loss": 1.9841,
+      "step": 3520
+    },
+    {
+      "epoch": 0.7080354017700885,
+      "grad_norm": 0.7695503234863281,
+      "learning_rate": 1.4612922584516905e-05,
+      "loss": 1.9903,
+      "step": 3540
+    },
+    {
+      "epoch": 0.712035601780089,
+      "grad_norm": 1.4897569417953491,
+      "learning_rate": 1.4412882576515304e-05,
+      "loss": 2.1432,
+      "step": 3560
+    },
+    {
+      "epoch": 0.7160358017900895,
+      "grad_norm": 1.0326311588287354,
+      "learning_rate": 1.4212842568513703e-05,
+      "loss": 1.9147,
+      "step": 3580
+    },
+    {
+      "epoch": 0.72003600180009,
+      "grad_norm": 0.7355366349220276,
+      "learning_rate": 1.4012802560512102e-05,
+      "loss": 2.0576,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7240362018100905,
+      "grad_norm": 0.6938571929931641,
+      "learning_rate": 1.3812762552510503e-05,
+      "loss": 2.054,
+      "step": 3620
+    },
+    {
+      "epoch": 0.728036401820091,
+      "grad_norm": 0.6852160096168518,
+      "learning_rate": 1.3612722544508902e-05,
+      "loss": 1.9193,
+      "step": 3640
+    },
+    {
+      "epoch": 0.7320366018300916,
+      "grad_norm": 0.7561490535736084,
+      "learning_rate": 1.3412682536507301e-05,
+      "loss": 2.0894,
+      "step": 3660
+    },
+    {
+      "epoch": 0.736036801840092,
+      "grad_norm": 0.6602606773376465,
+      "learning_rate": 1.32126425285057e-05,
+      "loss": 1.9736,
+      "step": 3680
+    },
+    {
+      "epoch": 0.7400370018500925,
+      "grad_norm": 0.7436513900756836,
+      "learning_rate": 1.3012602520504103e-05,
+      "loss": 1.9555,
+      "step": 3700
+    },
+    {
+      "epoch": 0.744037201860093,
+      "grad_norm": 1.3854832649230957,
+      "learning_rate": 1.2812562512502502e-05,
+      "loss": 2.0192,
+      "step": 3720
+    },
+    {
+      "epoch": 0.7480374018700935,
+      "grad_norm": 0.6004545092582703,
+      "learning_rate": 1.26125225045009e-05,
+      "loss": 1.983,
+      "step": 3740
+    },
+    {
+      "epoch": 0.752037601880094,
+      "grad_norm": 0.7762808203697205,
+      "learning_rate": 1.24124824964993e-05,
+      "loss": 2.0417,
+      "step": 3760
+    },
+    {
+      "epoch": 0.7560378018900945,
+      "grad_norm": 0.8214603662490845,
+      "learning_rate": 1.2212442488497699e-05,
+      "loss": 1.9654,
+      "step": 3780
+    },
+    {
+      "epoch": 0.760038001900095,
+      "grad_norm": 0.5759713649749756,
+      "learning_rate": 1.20124024804961e-05,
+      "loss": 1.9993,
+      "step": 3800
+    },
+    {
+      "epoch": 0.7640382019100955,
+      "grad_norm": 0.8434195518493652,
+      "learning_rate": 1.1812362472494499e-05,
+      "loss": 2.136,
+      "step": 3820
+    },
+    {
+      "epoch": 0.768038401920096,
+      "grad_norm": 0.8448805212974548,
+      "learning_rate": 1.16123224644929e-05,
+      "loss": 2.0152,
+      "step": 3840
+    },
+    {
+      "epoch": 0.7720386019300965,
+      "grad_norm": 0.6562586426734924,
+      "learning_rate": 1.14122824564913e-05,
+      "loss": 1.8749,
+      "step": 3860
+    },
+    {
+      "epoch": 0.776038801940097,
+      "grad_norm": 1.0431329011917114,
+      "learning_rate": 1.12122424484897e-05,
+      "loss": 1.9336,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7800390019500975,
+      "grad_norm": 0.6854636669158936,
+      "learning_rate": 1.1012202440488098e-05,
+      "loss": 2.0081,
+      "step": 3900
+    },
+    {
+      "epoch": 0.784039201960098,
+      "grad_norm": 0.8114478588104248,
+      "learning_rate": 1.0812162432486497e-05,
+      "loss": 2.1009,
+      "step": 3920
+    },
+    {
+      "epoch": 0.7880394019700985,
+      "grad_norm": 0.8008230924606323,
+      "learning_rate": 1.0612122424484898e-05,
+      "loss": 2.004,
+      "step": 3940
+    },
+    {
+      "epoch": 0.792039601980099,
+      "grad_norm": 0.9538334012031555,
+      "learning_rate": 1.0412082416483297e-05,
+      "loss": 1.9979,
+      "step": 3960
+    },
+    {
+      "epoch": 0.7960398019900995,
+      "grad_norm": 0.7155557870864868,
+      "learning_rate": 1.0212042408481696e-05,
+      "loss": 1.9874,
+      "step": 3980
+    },
+    {
+      "epoch": 0.8000400020001001,
+      "grad_norm": 0.8598875403404236,
+      "learning_rate": 1.0012002400480097e-05,
+      "loss": 2.0835,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8040402020101005,
+      "grad_norm": 0.7795354723930359,
+      "learning_rate": 9.811962392478496e-06,
+      "loss": 2.0461,
+      "step": 4020
+    },
+    {
+      "epoch": 0.808040402020101,
+      "grad_norm": 0.9291620254516602,
+      "learning_rate": 9.611922384476895e-06,
+      "loss": 1.9119,
+      "step": 4040
+    },
+    {
+      "epoch": 0.8120406020301015,
+      "grad_norm": 1.3040659427642822,
+      "learning_rate": 9.411882376475296e-06,
+      "loss": 2.0027,
+      "step": 4060
+    },
+    {
+      "epoch": 0.816040802040102,
+      "grad_norm": 0.9256519079208374,
+      "learning_rate": 9.211842368473696e-06,
+      "loss": 2.0155,
+      "step": 4080
+    },
+    {
+      "epoch": 0.8200410020501026,
+      "grad_norm": 0.8110265731811523,
+      "learning_rate": 9.011802360472095e-06,
+      "loss": 2.0509,
+      "step": 4100
+    },
+    {
+      "epoch": 0.824041202060103,
+      "grad_norm": 0.8365888595581055,
+      "learning_rate": 8.811762352470494e-06,
+      "loss": 2.0216,
+      "step": 4120
+    },
+    {
+      "epoch": 0.8280414020701035,
+      "grad_norm": 0.7588245868682861,
+      "learning_rate": 8.611722344468894e-06,
+      "loss": 2.068,
+      "step": 4140
+    },
+    {
+      "epoch": 0.832041602080104,
+      "grad_norm": 0.7555562853813171,
+      "learning_rate": 8.411682336467294e-06,
+      "loss": 2.0204,
+      "step": 4160
+    },
+    {
+      "epoch": 0.8360418020901045,
+      "grad_norm": 0.7847601771354675,
+      "learning_rate": 8.211642328465693e-06,
+      "loss": 2.0697,
+      "step": 4180
+    },
+    {
+      "epoch": 0.840042002100105,
+      "grad_norm": 0.739717960357666,
+      "learning_rate": 8.011602320464092e-06,
+      "loss": 2.0007,
+      "step": 4200
+    },
+    {
+      "epoch": 0.8440422021101055,
+      "grad_norm": 0.7168034911155701,
+      "learning_rate": 7.811562312462493e-06,
+      "loss": 2.0782,
+      "step": 4220
+    },
+    {
+      "epoch": 0.848042402120106,
+      "grad_norm": 0.9154016375541687,
+      "learning_rate": 7.611522304460893e-06,
+      "loss": 2.1683,
+      "step": 4240
+    },
+    {
+      "epoch": 0.8520426021301065,
+      "grad_norm": 0.7498438954353333,
+      "learning_rate": 7.411482296459292e-06,
+      "loss": 2.0312,
+      "step": 4260
+    },
+    {
+      "epoch": 0.856042802140107,
+      "grad_norm": 0.7634301781654358,
+      "learning_rate": 7.211442288457691e-06,
+      "loss": 2.0329,
+      "step": 4280
+    },
+    {
+      "epoch": 0.8600430021501075,
+      "grad_norm": 0.7307527661323547,
+      "learning_rate": 7.011402280456092e-06,
+      "loss": 1.9397,
+      "step": 4300
+    },
+    {
+      "epoch": 0.864043202160108,
+      "grad_norm": 0.8496876955032349,
+      "learning_rate": 6.811362272454491e-06,
+      "loss": 1.9961,
+      "step": 4320
+    },
+    {
+      "epoch": 0.8680434021701086,
+      "grad_norm": 0.655273973941803,
+      "learning_rate": 6.611322264452891e-06,
+      "loss": 1.9959,
+      "step": 4340
+    },
+    {
+      "epoch": 0.872043602180109,
+      "grad_norm": 0.9990720152854919,
+      "learning_rate": 6.4112822564512915e-06,
+      "loss": 2.0527,
+      "step": 4360
+    },
+    {
+      "epoch": 0.8760438021901095,
+      "grad_norm": 0.7674184441566467,
+      "learning_rate": 6.2112422484496905e-06,
+      "loss": 1.8886,
+      "step": 4380
+    },
+    {
+      "epoch": 0.88004400220011,
+      "grad_norm": 0.6465392112731934,
+      "learning_rate": 6.0112022404480895e-06,
+      "loss": 2.0483,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8840442022101105,
+      "grad_norm": 0.7932650446891785,
+      "learning_rate": 5.8111622324464894e-06,
+      "loss": 1.9573,
+      "step": 4420
+    },
+    {
+      "epoch": 0.888044402220111,
+      "grad_norm": 0.9597405195236206,
+      "learning_rate": 5.611122224444889e-06,
+      "loss": 2.0108,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8920446022301115,
+      "grad_norm": 0.9801364541053772,
+      "learning_rate": 5.411082216443289e-06,
+      "loss": 2.094,
+      "step": 4460
+    },
+    {
+      "epoch": 0.896044802240112,
+      "grad_norm": 0.7602345943450928,
+      "learning_rate": 5.211042208441689e-06,
+      "loss": 2.0018,
+      "step": 4480
+    },
+    {
+      "epoch": 0.9000450022501125,
+      "grad_norm": 0.7290012836456299,
+      "learning_rate": 5.011002200440088e-06,
+      "loss": 2.0802,
+      "step": 4500
+    },
+    {
+      "epoch": 0.904045202260113,
+      "grad_norm": 0.7639509439468384,
+      "learning_rate": 4.810962192438488e-06,
+      "loss": 1.9796,
+      "step": 4520
+    },
+    {
+      "epoch": 0.9080454022701135,
+      "grad_norm": 0.8385800123214722,
+      "learning_rate": 4.610922184436887e-06,
+      "loss": 1.9641,
+      "step": 4540
+    },
+    {
+      "epoch": 0.912045602280114,
+      "grad_norm": 1.0409960746765137,
+      "learning_rate": 4.410882176435288e-06,
+      "loss": 2.0136,
+      "step": 4560
+    },
+    {
+      "epoch": 0.9160458022901145,
+      "grad_norm": 0.8607903122901917,
+      "learning_rate": 4.210842168433687e-06,
+      "loss": 2.0872,
+      "step": 4580
+    },
+    {
+      "epoch": 0.920046002300115,
+      "grad_norm": 0.8632460832595825,
+      "learning_rate": 4.010802160432087e-06,
+      "loss": 2.0332,
+      "step": 4600
+    },
+    {
+      "epoch": 0.9240462023101155,
+      "grad_norm": 0.7640056610107422,
+      "learning_rate": 3.810762152430486e-06,
+      "loss": 2.0735,
+      "step": 4620
+    },
+    {
+      "epoch": 0.928046402320116,
+      "grad_norm": 0.7293581962585449,
+      "learning_rate": 3.610722144428886e-06,
+      "loss": 1.9248,
+      "step": 4640
+    },
+    {
+      "epoch": 0.9320466023301165,
+      "grad_norm": 0.8446735739707947,
+      "learning_rate": 3.410682136427286e-06,
+      "loss": 1.9839,
+      "step": 4660
+    },
+    {
+      "epoch": 0.9360468023401171,
+      "grad_norm": 1.1868308782577515,
+      "learning_rate": 3.210642128425685e-06,
+      "loss": 2.0027,
+      "step": 4680
+    },
+    {
+      "epoch": 0.9400470023501175,
+      "grad_norm": 0.6171759963035583,
+      "learning_rate": 3.010602120424085e-06,
+      "loss": 1.9157,
+      "step": 4700
+    },
+    {
+      "epoch": 0.944047202360118,
+      "grad_norm": 0.618806779384613,
+      "learning_rate": 2.810562112422485e-06,
+      "loss": 1.9603,
+      "step": 4720
+    },
+    {
+      "epoch": 0.9480474023701185,
+      "grad_norm": 0.8312541246414185,
+      "learning_rate": 2.6105221044208843e-06,
+      "loss": 1.9326,
+      "step": 4740
+    },
+    {
+      "epoch": 0.952047602380119,
+      "grad_norm": 0.8903723955154419,
+      "learning_rate": 2.410482096419284e-06,
+      "loss": 2.1221,
+      "step": 4760
+    },
+    {
+      "epoch": 0.9560478023901195,
+      "grad_norm": 0.7693188190460205,
+      "learning_rate": 2.2104420884176836e-06,
+      "loss": 1.96,
+      "step": 4780
+    },
+    {
+      "epoch": 0.96004800240012,
+      "grad_norm": 0.756828248500824,
+      "learning_rate": 2.010402080416083e-06,
+      "loss": 2.1037,
+      "step": 4800
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 4999,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.1306101383168e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-4800/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55cd6babe646b8732ea46972b956ce2974c86621f5e41ed48376c3798e1d5d2c
+size 5048

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,130 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55cd6babe646b8732ea46972b956ce2974c86621f5e41ed48376c3798e1d5d2c
+size 5048