ErkaMarka commited on
Commit
856ee30
·
verified ·
1 Parent(s): fcc65a7

Upload mongolian-mistral-7b-chatbot

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - mn
4
+ license: apache-2.0
5
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
6
+ tags:
7
+ - mongolian
8
+ - fine-tuned
9
+ - lora
10
+ - chatbot
11
+ datasets:
12
+ - custom
13
+ ---
14
+
15
+ # mongolian-mistral-7b-chatbot
16
+
17
+ ## Description
18
+ Mistral 7B fine-tuned on Mongolian news data for chatbot
19
+
20
+ ## Model Details
21
+ - **Base Model:** mistralai/Mistral-7B-Instruct-v0.2
22
+ - **Language:** Mongolian (mn)
23
+ - **Fine-tuning Method:** LoRA (Low-Rank Adaptation)
24
+ - **Training Data:** Eduge Mongolian News Dataset (75,000+ articles)
25
+
26
+ ## Training Configuration
27
+ - **LoRA Rank:** 32
28
+ - **LoRA Alpha:** 64
29
+ - **Epochs:** 3
30
+ - **Learning Rate:** 2e-4
31
+ - **Batch Size:** 4
32
+ - **Max Sequence Length:** 1024
33
+
34
+ ## Mongolian Tokens Added
35
+ - Total new tokens: ~9,500
36
+ - Sources: Mongolian-NLP repository
37
+ - Most frequent words
38
+ - Abbreviations
39
+ - District/place names
40
+ - Country names
41
+ - Named entities (NER)
42
+
43
+ ## Usage
44
+ ```python
45
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
46
+ from peft import PeftModel
47
+
48
+ # Load tokenizer
49
+ tokenizer = AutoTokenizer.from_pretrained("ErkaMarka/mongolian-mistral-7b-chatbot")
50
+
51
+ # Load base model
52
+ base_model = AutoModelForCausalLM.from_pretrained(
53
+ "mistralai/Mistral-7B-Instruct-v0.2",
54
+ torch_dtype=torch.float16,
55
+ device_map="auto"
56
+ )
57
+
58
+ # Resize embeddings for new tokens
59
+ base_model.resize_token_embeddings(len(tokenizer))
60
+
61
+ # Load LoRA adapter
62
+ model = PeftModel.from_pretrained(base_model, "ErkaMarka/mongolian-mistral-7b-chatbot")
63
+
64
+ # Generate
65
+ messages = [{"role": "user", "content": "Монгол улсын нийслэл хот юу вэ?"}]
66
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
67
+ inputs = tokenizer(text, return_tensors="pt").to(model.device)
68
+
69
+ outputs = model.generate(**inputs, max_new_tokens=150)
70
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
71
+ ```
72
+
73
+ ## Evaluation Results
74
+ Evaluated on 100 Mongolian Q&A pairs using BLEU score.
75
+
76
+ ## License
77
+ Apache 2.0
78
+
79
+ ## Citation
80
+ ```
81
+ @misc{mongolian_mistral_7b_chatbot,
82
+ author = {ErkaMarka},
83
+ title = {mongolian-mistral-7b-chatbot},
84
+ year = {2024},
85
+ publisher = {Hugging Face},
86
+ url = {https://huggingface.co/ErkaMarka/mongolian-mistral-7b-chatbot}
87
+ }
88
+ ```
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "v_proj",
34
+ "up_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a1eee9c4e2ea5c1715066b179c6310891607fce306c1fbbcccd5bec9e964658
3
+ size 335604696
added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
chat_template.jinja ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
checkpoint-8500/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:mistralai/Mistral-7B-Instruct-v0.2
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-8500/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "v_proj",
34
+ "up_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-8500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:260a083e78e7446beabba4dc2220c33d1368a3b49255e54036ce6b89f4acc62a
3
+ size 335604696
checkpoint-8500/chat_template.jinja ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
checkpoint-8500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c33fe14dbd5003d189d64654761c330120b32218191d14801ade54126b08712
3
+ size 671466706
checkpoint-8500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7f87da9eb31a8f186e20c50eff2dab7a1ac22eb3c77f52d0da900c4cb7170c9
3
+ size 14244
checkpoint-8500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:244785c20c15168a893df15900ee311660c9bceabe3d8c118350af5529973fa6
3
+ size 988
checkpoint-8500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ccfa9c8898c4dd40288d270472ea45c541256a641c9d0960d647c40fba7f444
3
+ size 1064
checkpoint-8500/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-8500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-8500/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-8500/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-8500/trainer_state.json ADDED
@@ -0,0 +1,1360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.867892694449131,
6
+ "eval_steps": 500,
7
+ "global_step": 8500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016871941960519655,
14
+ "grad_norm": 2.0177435874938965,
15
+ "learning_rate": 3.670411985018727e-05,
16
+ "loss": 1.9696,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.03374388392103931,
21
+ "grad_norm": 1.9107558727264404,
22
+ "learning_rate": 7.415730337078653e-05,
23
+ "loss": 1.5446,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.05061582588155897,
28
+ "grad_norm": 1.3704568147659302,
29
+ "learning_rate": 0.00011161048689138578,
30
+ "loss": 1.3815,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.06748776784207862,
35
+ "grad_norm": 1.3545180559158325,
36
+ "learning_rate": 0.00014906367041198505,
37
+ "loss": 1.2514,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.08435970980259828,
42
+ "grad_norm": 1.2799007892608643,
43
+ "learning_rate": 0.00018651685393258427,
44
+ "loss": 1.1922,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.10123165176311794,
49
+ "grad_norm": 1.1978015899658203,
50
+ "learning_rate": 0.0001999932072351269,
51
+ "loss": 1.1556,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.11810359372363759,
56
+ "grad_norm": 0.9664444923400879,
57
+ "learning_rate": 0.00019995539875714444,
58
+ "loss": 1.0915,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.13497553568415724,
63
+ "grad_norm": 1.0787692070007324,
64
+ "learning_rate": 0.0001998844378161928,
65
+ "loss": 1.0562,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.1518474776446769,
70
+ "grad_norm": 1.0797227621078491,
71
+ "learning_rate": 0.00019978034794806892,
72
+ "loss": 1.0253,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.16871941960519657,
77
+ "grad_norm": 0.9610917568206787,
78
+ "learning_rate": 0.00019964316367652584,
79
+ "loss": 1.0084,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.16871941960519657,
84
+ "eval_loss": 0.9864674210548401,
85
+ "eval_runtime": 297.0281,
86
+ "eval_samples_per_second": 8.403,
87
+ "eval_steps_per_second": 1.05,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 0.1855913615657162,
92
+ "grad_norm": 1.020251750946045,
93
+ "learning_rate": 0.00019947293050182204,
94
+ "loss": 0.9978,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 0.20246330352623587,
99
+ "grad_norm": 1.024293303489685,
100
+ "learning_rate": 0.00019926970488563033,
101
+ "loss": 0.9751,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 0.21933524548675554,
106
+ "grad_norm": 1.01112699508667,
107
+ "learning_rate": 0.00019903355423231105,
108
+ "loss": 0.9533,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 0.23620718744727517,
113
+ "grad_norm": 0.9841225147247314,
114
+ "learning_rate": 0.00019876455686655583,
115
+ "loss": 0.9311,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 0.2530791294077948,
120
+ "grad_norm": 0.96900475025177,
121
+ "learning_rate": 0.00019846280200740965,
122
+ "loss": 0.9292,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 0.2699510713683145,
127
+ "grad_norm": 0.9467640519142151,
128
+ "learning_rate": 0.000198128389738679,
129
+ "loss": 0.9316,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 0.28682301332883414,
134
+ "grad_norm": 0.9673274755477905,
135
+ "learning_rate": 0.00019776143097573705,
136
+ "loss": 0.8972,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 0.3036949552893538,
141
+ "grad_norm": 0.8616816401481628,
142
+ "learning_rate": 0.00019736204742873604,
143
+ "loss": 0.8998,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 0.3205668972498735,
148
+ "grad_norm": 0.9249860644340515,
149
+ "learning_rate": 0.00019693037156223942,
150
+ "loss": 0.8788,
151
+ "step": 950
152
+ },
153
+ {
154
+ "epoch": 0.33743883921039314,
155
+ "grad_norm": 0.8553484678268433,
156
+ "learning_rate": 0.00019646654655128672,
157
+ "loss": 0.8766,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 0.33743883921039314,
162
+ "eval_loss": 0.8600347638130188,
163
+ "eval_runtime": 296.4688,
164
+ "eval_samples_per_second": 8.419,
165
+ "eval_steps_per_second": 1.052,
166
+ "step": 1000
167
+ },
168
+ {
169
+ "epoch": 0.35431078117091275,
170
+ "grad_norm": 0.9525033235549927,
171
+ "learning_rate": 0.00019597072623390668,
172
+ "loss": 0.8831,
173
+ "step": 1050
174
+ },
175
+ {
176
+ "epoch": 0.3711827231314324,
177
+ "grad_norm": 0.9420183300971985,
178
+ "learning_rate": 0.00019544307506009313,
179
+ "loss": 0.8662,
180
+ "step": 1100
181
+ },
182
+ {
183
+ "epoch": 0.3880546650919521,
184
+ "grad_norm": 0.9950663447380066,
185
+ "learning_rate": 0.00019488376803726153,
186
+ "loss": 0.8687,
187
+ "step": 1150
188
+ },
189
+ {
190
+ "epoch": 0.40492660705247174,
191
+ "grad_norm": 0.8866828083992004,
192
+ "learning_rate": 0.00019429299067220387,
193
+ "loss": 0.8676,
194
+ "step": 1200
195
+ },
196
+ {
197
+ "epoch": 0.4217985490129914,
198
+ "grad_norm": 0.9560692310333252,
199
+ "learning_rate": 0.00019367093890956108,
200
+ "loss": 0.8552,
201
+ "step": 1250
202
+ },
203
+ {
204
+ "epoch": 0.4386704909735111,
205
+ "grad_norm": 0.9126181602478027,
206
+ "learning_rate": 0.00019301781906683362,
207
+ "loss": 0.8335,
208
+ "step": 1300
209
+ },
210
+ {
211
+ "epoch": 0.4555424329340307,
212
+ "grad_norm": 0.9437697529792786,
213
+ "learning_rate": 0.0001923338477659515,
214
+ "loss": 0.8424,
215
+ "step": 1350
216
+ },
217
+ {
218
+ "epoch": 0.47241437489455035,
219
+ "grad_norm": 0.9593287110328674,
220
+ "learning_rate": 0.00019161925186142692,
221
+ "loss": 0.8386,
222
+ "step": 1400
223
+ },
224
+ {
225
+ "epoch": 0.48928631685507,
226
+ "grad_norm": 0.9925290942192078,
227
+ "learning_rate": 0.00019087426836511277,
228
+ "loss": 0.8431,
229
+ "step": 1450
230
+ },
231
+ {
232
+ "epoch": 0.5061582588155896,
233
+ "grad_norm": 0.9618675112724304,
234
+ "learning_rate": 0.00019009914436759223,
235
+ "loss": 0.8299,
236
+ "step": 1500
237
+ },
238
+ {
239
+ "epoch": 0.5061582588155896,
240
+ "eval_loss": 0.8129043579101562,
241
+ "eval_runtime": 296.3926,
242
+ "eval_samples_per_second": 8.421,
243
+ "eval_steps_per_second": 1.053,
244
+ "step": 1500
245
+ },
246
+ {
247
+ "epoch": 0.5230302007761093,
248
+ "grad_norm": 0.951849639415741,
249
+ "learning_rate": 0.00018929413695622572,
250
+ "loss": 0.8211,
251
+ "step": 1550
252
+ },
253
+ {
254
+ "epoch": 0.539902142736629,
255
+ "grad_norm": 0.9019031524658203,
256
+ "learning_rate": 0.00018845951312988196,
257
+ "loss": 0.8234,
258
+ "step": 1600
259
+ },
260
+ {
261
+ "epoch": 0.5567740846971486,
262
+ "grad_norm": 0.9506643414497375,
263
+ "learning_rate": 0.00018759554971038196,
264
+ "loss": 0.8194,
265
+ "step": 1650
266
+ },
267
+ {
268
+ "epoch": 0.5736460266576683,
269
+ "grad_norm": 0.9535942077636719,
270
+ "learning_rate": 0.00018670253325068456,
271
+ "loss": 0.8017,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 0.590517968618188,
276
+ "grad_norm": 1.0262322425842285,
277
+ "learning_rate": 0.00018578075993984488,
278
+ "loss": 0.8162,
279
+ "step": 1750
280
+ },
281
+ {
282
+ "epoch": 0.6073899105787076,
283
+ "grad_norm": 0.8758232593536377,
284
+ "learning_rate": 0.00018483053550477649,
285
+ "loss": 0.7917,
286
+ "step": 1800
287
+ },
288
+ {
289
+ "epoch": 0.6242618525392273,
290
+ "grad_norm": 0.9430511593818665,
291
+ "learning_rate": 0.00018385217510885008,
292
+ "loss": 0.8057,
293
+ "step": 1850
294
+ },
295
+ {
296
+ "epoch": 0.641133794499747,
297
+ "grad_norm": 0.8898816108703613,
298
+ "learning_rate": 0.00018284600324736257,
299
+ "loss": 0.7983,
300
+ "step": 1900
301
+ },
302
+ {
303
+ "epoch": 0.6580057364602666,
304
+ "grad_norm": 0.8455677628517151,
305
+ "learning_rate": 0.00018181235363991087,
306
+ "loss": 0.7793,
307
+ "step": 1950
308
+ },
309
+ {
310
+ "epoch": 0.6748776784207863,
311
+ "grad_norm": 0.8179712295532227,
312
+ "learning_rate": 0.00018075156911970616,
313
+ "loss": 0.7874,
314
+ "step": 2000
315
+ },
316
+ {
317
+ "epoch": 0.6748776784207863,
318
+ "eval_loss": 0.774493932723999,
319
+ "eval_runtime": 296.6518,
320
+ "eval_samples_per_second": 8.414,
321
+ "eval_steps_per_second": 1.052,
322
+ "step": 2000
323
+ },
324
+ {
325
+ "epoch": 0.6917496203813059,
326
+ "grad_norm": 1.012557029724121,
327
+ "learning_rate": 0.00017966400151986562,
328
+ "loss": 0.7822,
329
+ "step": 2050
330
+ },
331
+ {
332
+ "epoch": 0.7086215623418255,
333
+ "grad_norm": 0.8059019446372986,
334
+ "learning_rate": 0.00017855001155671905,
335
+ "loss": 0.7862,
336
+ "step": 2100
337
+ },
338
+ {
339
+ "epoch": 0.7254935043023452,
340
+ "grad_norm": 0.8411868810653687,
341
+ "learning_rate": 0.00017740996871016903,
342
+ "loss": 0.7789,
343
+ "step": 2150
344
+ },
345
+ {
346
+ "epoch": 0.7423654462628648,
347
+ "grad_norm": 0.9554468989372253,
348
+ "learning_rate": 0.0001762442511011448,
349
+ "loss": 0.7709,
350
+ "step": 2200
351
+ },
352
+ {
353
+ "epoch": 0.7592373882233845,
354
+ "grad_norm": 0.947722852230072,
355
+ "learning_rate": 0.00017505324536618968,
356
+ "loss": 0.7572,
357
+ "step": 2250
358
+ },
359
+ {
360
+ "epoch": 0.7761093301839042,
361
+ "grad_norm": 1.0708439350128174,
362
+ "learning_rate": 0.0001738373465292245,
363
+ "loss": 0.775,
364
+ "step": 2300
365
+ },
366
+ {
367
+ "epoch": 0.7929812721444238,
368
+ "grad_norm": 0.9122579097747803,
369
+ "learning_rate": 0.00017259695787052895,
370
+ "loss": 0.7638,
371
+ "step": 2350
372
+ },
373
+ {
374
+ "epoch": 0.8098532141049435,
375
+ "grad_norm": 1.1625076532363892,
376
+ "learning_rate": 0.00017133249079298455,
377
+ "loss": 0.7654,
378
+ "step": 2400
379
+ },
380
+ {
381
+ "epoch": 0.8267251560654632,
382
+ "grad_norm": 0.971483051776886,
383
+ "learning_rate": 0.0001700443646856237,
384
+ "loss": 0.7503,
385
+ "step": 2450
386
+ },
387
+ {
388
+ "epoch": 0.8435970980259828,
389
+ "grad_norm": 0.9200385808944702,
390
+ "learning_rate": 0.0001687330067845297,
391
+ "loss": 0.7752,
392
+ "step": 2500
393
+ },
394
+ {
395
+ "epoch": 0.8435970980259828,
396
+ "eval_loss": 0.7435723543167114,
397
+ "eval_runtime": 296.1583,
398
+ "eval_samples_per_second": 8.428,
399
+ "eval_steps_per_second": 1.053,
400
+ "step": 2500
401
+ },
402
+ {
403
+ "epoch": 0.8604690399865025,
404
+ "grad_norm": 1.0115654468536377,
405
+ "learning_rate": 0.00016739885203113442,
406
+ "loss": 0.7602,
407
+ "step": 2550
408
+ },
409
+ {
410
+ "epoch": 0.8773409819470221,
411
+ "grad_norm": 1.0669232606887817,
412
+ "learning_rate": 0.00016604234292796007,
413
+ "loss": 0.7585,
414
+ "step": 2600
415
+ },
416
+ {
417
+ "epoch": 0.8942129239075418,
418
+ "grad_norm": 0.9092018604278564,
419
+ "learning_rate": 0.00016466392939185317,
420
+ "loss": 0.7534,
421
+ "step": 2650
422
+ },
423
+ {
424
+ "epoch": 0.9110848658680614,
425
+ "grad_norm": 1.232256293296814,
426
+ "learning_rate": 0.00016326406860475977,
427
+ "loss": 0.7418,
428
+ "step": 2700
429
+ },
430
+ {
431
+ "epoch": 0.927956807828581,
432
+ "grad_norm": 1.0810918807983398,
433
+ "learning_rate": 0.00016184322486209043,
434
+ "loss": 0.7439,
435
+ "step": 2750
436
+ },
437
+ {
438
+ "epoch": 0.9448287497891007,
439
+ "grad_norm": 0.9745123982429504,
440
+ "learning_rate": 0.00016040186941872631,
441
+ "loss": 0.7421,
442
+ "step": 2800
443
+ },
444
+ {
445
+ "epoch": 0.9617006917496204,
446
+ "grad_norm": 1.1044626235961914,
447
+ "learning_rate": 0.00015894048033271684,
448
+ "loss": 0.7388,
449
+ "step": 2850
450
+ },
451
+ {
452
+ "epoch": 0.97857263371014,
453
+ "grad_norm": 0.9833297729492188,
454
+ "learning_rate": 0.00015745954230672105,
455
+ "loss": 0.7364,
456
+ "step": 2900
457
+ },
458
+ {
459
+ "epoch": 0.9954445756706597,
460
+ "grad_norm": 1.0058021545410156,
461
+ "learning_rate": 0.00015595954652724485,
462
+ "loss": 0.742,
463
+ "step": 2950
464
+ },
465
+ {
466
+ "epoch": 1.012147798211574,
467
+ "grad_norm": 1.026547908782959,
468
+ "learning_rate": 0.00015444099050172807,
469
+ "loss": 0.6689,
470
+ "step": 3000
471
+ },
472
+ {
473
+ "epoch": 1.012147798211574,
474
+ "eval_loss": 0.7237228155136108,
475
+ "eval_runtime": 296.57,
476
+ "eval_samples_per_second": 8.416,
477
+ "eval_steps_per_second": 1.052,
478
+ "step": 3000
479
+ },
480
+ {
481
+ "epoch": 1.0290197401720937,
482
+ "grad_norm": 1.1261744499206543,
483
+ "learning_rate": 0.0001529043778935349,
484
+ "loss": 0.6521,
485
+ "step": 3050
486
+ },
487
+ {
488
+ "epoch": 1.0458916821326134,
489
+ "grad_norm": 1.1573985815048218,
490
+ "learning_rate": 0.00015138147018095146,
491
+ "loss": 0.6497,
492
+ "step": 3100
493
+ },
494
+ {
495
+ "epoch": 1.062763624093133,
496
+ "grad_norm": 1.098228931427002,
497
+ "learning_rate": 0.00014981061472467248,
498
+ "loss": 0.6544,
499
+ "step": 3150
500
+ },
501
+ {
502
+ "epoch": 1.0796355660536527,
503
+ "grad_norm": 1.1172102689743042,
504
+ "learning_rate": 0.00014822323845430378,
505
+ "loss": 0.6586,
506
+ "step": 3200
507
+ },
508
+ {
509
+ "epoch": 1.0965075080141724,
510
+ "grad_norm": 1.016932487487793,
511
+ "learning_rate": 0.0001466198678589963,
512
+ "loss": 0.6721,
513
+ "step": 3250
514
+ },
515
+ {
516
+ "epoch": 1.113379449974692,
517
+ "grad_norm": 0.972201406955719,
518
+ "learning_rate": 0.00014500103473277963,
519
+ "loss": 0.6687,
520
+ "step": 3300
521
+ },
522
+ {
523
+ "epoch": 1.1302513919352117,
524
+ "grad_norm": 1.0418280363082886,
525
+ "learning_rate": 0.0001433672759981806,
526
+ "loss": 0.6643,
527
+ "step": 3350
528
+ },
529
+ {
530
+ "epoch": 1.1471233338957314,
531
+ "grad_norm": 1.0220147371292114,
532
+ "learning_rate": 0.00014171913352814075,
533
+ "loss": 0.6538,
534
+ "step": 3400
535
+ },
536
+ {
537
+ "epoch": 1.163995275856251,
538
+ "grad_norm": 0.9467246532440186,
539
+ "learning_rate": 0.000140057153966292,
540
+ "loss": 0.6427,
541
+ "step": 3450
542
+ },
543
+ {
544
+ "epoch": 1.1808672178167707,
545
+ "grad_norm": 1.1602420806884766,
546
+ "learning_rate": 0.00013838188854564993,
547
+ "loss": 0.6496,
548
+ "step": 3500
549
+ },
550
+ {
551
+ "epoch": 1.1808672178167707,
552
+ "eval_loss": 0.6991020441055298,
553
+ "eval_runtime": 296.4391,
554
+ "eval_samples_per_second": 8.42,
555
+ "eval_steps_per_second": 1.052,
556
+ "step": 3500
557
+ },
558
+ {
559
+ "epoch": 1.1977391597772904,
560
+ "grad_norm": 1.0359673500061035,
561
+ "learning_rate": 0.00013669389290578491,
562
+ "loss": 0.6574,
563
+ "step": 3550
564
+ },
565
+ {
566
+ "epoch": 1.21461110173781,
567
+ "grad_norm": 1.0353167057037354,
568
+ "learning_rate": 0.0001349937269085317,
569
+ "loss": 0.6462,
570
+ "step": 3600
571
+ },
572
+ {
573
+ "epoch": 1.2314830436983297,
574
+ "grad_norm": 0.9980498552322388,
575
+ "learning_rate": 0.00013328195445229868,
576
+ "loss": 0.6515,
577
+ "step": 3650
578
+ },
579
+ {
580
+ "epoch": 1.2483549856588494,
581
+ "grad_norm": 1.051719069480896,
582
+ "learning_rate": 0.0001315591432850381,
583
+ "loss": 0.6546,
584
+ "step": 3700
585
+ },
586
+ {
587
+ "epoch": 1.265226927619369,
588
+ "grad_norm": 1.1057497262954712,
589
+ "learning_rate": 0.0001298258648159399,
590
+ "loss": 0.6313,
591
+ "step": 3750
592
+ },
593
+ {
594
+ "epoch": 1.2820988695798887,
595
+ "grad_norm": 1.1292750835418701,
596
+ "learning_rate": 0.0001280826939259106,
597
+ "loss": 0.6329,
598
+ "step": 3800
599
+ },
600
+ {
601
+ "epoch": 1.2989708115404084,
602
+ "grad_norm": 1.1297887563705444,
603
+ "learning_rate": 0.00012633020877690155,
604
+ "loss": 0.6384,
605
+ "step": 3850
606
+ },
607
+ {
608
+ "epoch": 1.3158427535009278,
609
+ "grad_norm": 0.9869160652160645,
610
+ "learning_rate": 0.00012456899062014806,
611
+ "loss": 0.6226,
612
+ "step": 3900
613
+ },
614
+ {
615
+ "epoch": 1.3327146954614477,
616
+ "grad_norm": 0.9497949481010437,
617
+ "learning_rate": 0.00012279962360338447,
618
+ "loss": 0.6225,
619
+ "step": 3950
620
+ },
621
+ {
622
+ "epoch": 1.3495866374219672,
623
+ "grad_norm": 1.053124189376831,
624
+ "learning_rate": 0.00012102269457709843,
625
+ "loss": 0.6196,
626
+ "step": 4000
627
+ },
628
+ {
629
+ "epoch": 1.3495866374219672,
630
+ "eval_loss": 0.6678062081336975,
631
+ "eval_runtime": 296.2326,
632
+ "eval_samples_per_second": 8.426,
633
+ "eval_steps_per_second": 1.053,
634
+ "step": 4000
635
+ },
636
+ {
637
+ "epoch": 1.366458579382487,
638
+ "grad_norm": 1.1627007722854614,
639
+ "learning_rate": 0.0001192387928998886,
640
+ "loss": 0.6527,
641
+ "step": 4050
642
+ },
643
+ {
644
+ "epoch": 1.3833305213430065,
645
+ "grad_norm": 1.329759955406189,
646
+ "learning_rate": 0.00011744851024299069,
647
+ "loss": 0.6297,
648
+ "step": 4100
649
+ },
650
+ {
651
+ "epoch": 1.4002024633035262,
652
+ "grad_norm": 1.1688311100006104,
653
+ "learning_rate": 0.00011565244039403622,
654
+ "loss": 0.63,
655
+ "step": 4150
656
+ },
657
+ {
658
+ "epoch": 1.4170744052640458,
659
+ "grad_norm": 0.9974623918533325,
660
+ "learning_rate": 0.00011385117906010953,
661
+ "loss": 0.6394,
662
+ "step": 4200
663
+ },
664
+ {
665
+ "epoch": 1.4339463472245655,
666
+ "grad_norm": 1.0659152269363403,
667
+ "learning_rate": 0.00011204532367016806,
668
+ "loss": 0.6181,
669
+ "step": 4250
670
+ },
671
+ {
672
+ "epoch": 1.4508182891850852,
673
+ "grad_norm": 1.0745680332183838,
674
+ "learning_rate": 0.00011027170545816326,
675
+ "loss": 0.6281,
676
+ "step": 4300
677
+ },
678
+ {
679
+ "epoch": 1.4676902311456048,
680
+ "grad_norm": 1.1593629121780396,
681
+ "learning_rate": 0.00010845852214547601,
682
+ "loss": 0.6296,
683
+ "step": 4350
684
+ },
685
+ {
686
+ "epoch": 1.4845621731061245,
687
+ "grad_norm": 1.3579237461090088,
688
+ "learning_rate": 0.00010664253337309687,
689
+ "loss": 0.6152,
690
+ "step": 4400
691
+ },
692
+ {
693
+ "epoch": 1.5014341150666441,
694
+ "grad_norm": 1.2364161014556885,
695
+ "learning_rate": 0.00010482434145467046,
696
+ "loss": 0.6067,
697
+ "step": 4450
698
+ },
699
+ {
700
+ "epoch": 1.5183060570271638,
701
+ "grad_norm": 1.0740337371826172,
702
+ "learning_rate": 0.00010300454943456457,
703
+ "loss": 0.6175,
704
+ "step": 4500
705
+ },
706
+ {
707
+ "epoch": 1.5183060570271638,
708
+ "eval_loss": 0.6415057182312012,
709
+ "eval_runtime": 296.7867,
710
+ "eval_samples_per_second": 8.41,
711
+ "eval_steps_per_second": 1.051,
712
+ "step": 4500
713
+ },
714
+ {
715
+ "epoch": 1.5351779989876835,
716
+ "grad_norm": 1.1068068742752075,
717
+ "learning_rate": 0.00010118376088785673,
718
+ "loss": 0.6221,
719
+ "step": 4550
720
+ },
721
+ {
722
+ "epoch": 1.5520499409482031,
723
+ "grad_norm": 1.1202526092529297,
724
+ "learning_rate": 9.936257972014506e-05,
725
+ "loss": 0.6198,
726
+ "step": 4600
727
+ },
728
+ {
729
+ "epoch": 1.5689218829087228,
730
+ "grad_norm": 1.1293288469314575,
731
+ "learning_rate": 9.754160996724927e-05,
732
+ "loss": 0.5997,
733
+ "step": 4650
734
+ },
735
+ {
736
+ "epoch": 1.5857938248692425,
737
+ "grad_norm": 1.0531474351882935,
738
+ "learning_rate": 9.572145559486855e-05,
739
+ "loss": 0.6041,
740
+ "step": 4700
741
+ },
742
+ {
743
+ "epoch": 1.6026657668297621,
744
+ "grad_norm": 0.895604133605957,
745
+ "learning_rate": 9.390272029826282e-05,
746
+ "loss": 0.6005,
747
+ "step": 4750
748
+ },
749
+ {
750
+ "epoch": 1.6195377087902818,
751
+ "grad_norm": 1.1192911863327026,
752
+ "learning_rate": 9.208600730202339e-05,
753
+ "loss": 0.5992,
754
+ "step": 4800
755
+ },
756
+ {
757
+ "epoch": 1.6364096507508015,
758
+ "grad_norm": 1.1751166582107544,
759
+ "learning_rate": 9.027191916000018e-05,
760
+ "loss": 0.586,
761
+ "step": 4850
762
+ },
763
+ {
764
+ "epoch": 1.6532815927113211,
765
+ "grad_norm": 0.9682310223579407,
766
+ "learning_rate": 8.846105755545086e-05,
767
+ "loss": 0.5969,
768
+ "step": 4900
769
+ },
770
+ {
771
+ "epoch": 1.6701535346718406,
772
+ "grad_norm": 1.1701383590698242,
773
+ "learning_rate": 8.665402310147924e-05,
774
+ "loss": 0.579,
775
+ "step": 4950
776
+ },
777
+ {
778
+ "epoch": 1.6870254766323605,
779
+ "grad_norm": 1.0403779745101929,
780
+ "learning_rate": 8.485141514182825e-05,
781
+ "loss": 0.5788,
782
+ "step": 5000
783
+ },
784
+ {
785
+ "epoch": 1.6870254766323605,
786
+ "eval_loss": 0.6112694144248962,
787
+ "eval_runtime": 296.234,
788
+ "eval_samples_per_second": 8.426,
789
+ "eval_steps_per_second": 1.053,
790
+ "step": 5000
791
+ },
792
+ {
793
+ "epoch": 1.70389741859288,
794
+ "grad_norm": 1.126141905784607,
795
+ "learning_rate": 8.305383155209414e-05,
796
+ "loss": 0.5862,
797
+ "step": 5050
798
+ },
799
+ {
800
+ "epoch": 1.7207693605533998,
801
+ "grad_norm": 1.0474798679351807,
802
+ "learning_rate": 8.126186854142752e-05,
803
+ "loss": 0.579,
804
+ "step": 5100
805
+ },
806
+ {
807
+ "epoch": 1.7376413025139192,
808
+ "grad_norm": 2.020526647567749,
809
+ "learning_rate": 7.947612045478724e-05,
810
+ "loss": 0.5636,
811
+ "step": 5150
812
+ },
813
+ {
814
+ "epoch": 1.7545132444744391,
815
+ "grad_norm": 1.0213319063186646,
816
+ "learning_rate": 7.76971795758122e-05,
817
+ "loss": 0.5695,
818
+ "step": 5200
819
+ },
820
+ {
821
+ "epoch": 1.7713851864349586,
822
+ "grad_norm": 1.034336805343628,
823
+ "learning_rate": 7.592563593037746e-05,
824
+ "loss": 0.5849,
825
+ "step": 5250
826
+ },
827
+ {
828
+ "epoch": 1.7882571283954785,
829
+ "grad_norm": 1.0699772834777832,
830
+ "learning_rate": 7.41972662287419e-05,
831
+ "loss": 0.5714,
832
+ "step": 5300
833
+ },
834
+ {
835
+ "epoch": 1.805129070355998,
836
+ "grad_norm": 1.1747502088546753,
837
+ "learning_rate": 7.244210001050232e-05,
838
+ "loss": 0.5604,
839
+ "step": 5350
840
+ },
841
+ {
842
+ "epoch": 1.8220010123165178,
843
+ "grad_norm": 1.082960605621338,
844
+ "learning_rate": 7.069607399149428e-05,
845
+ "loss": 0.5551,
846
+ "step": 5400
847
+ },
848
+ {
849
+ "epoch": 1.8388729542770372,
850
+ "grad_norm": 0.9143629670143127,
851
+ "learning_rate": 6.895976728063694e-05,
852
+ "loss": 0.5581,
853
+ "step": 5450
854
+ },
855
+ {
856
+ "epoch": 1.855744896237557,
857
+ "grad_norm": 1.0392253398895264,
858
+ "learning_rate": 6.723375576322166e-05,
859
+ "loss": 0.5506,
860
+ "step": 5500
861
+ },
862
+ {
863
+ "epoch": 1.855744896237557,
864
+ "eval_loss": 0.5858550667762756,
865
+ "eval_runtime": 296.794,
866
+ "eval_samples_per_second": 8.41,
867
+ "eval_steps_per_second": 1.051,
868
+ "step": 5500
869
+ },
870
+ {
871
+ "epoch": 1.8726168381980766,
872
+ "grad_norm": 1.1118271350860596,
873
+ "learning_rate": 6.551861190990665e-05,
874
+ "loss": 0.5508,
875
+ "step": 5550
876
+ },
877
+ {
878
+ "epoch": 1.8894887801585962,
879
+ "grad_norm": 1.0717848539352417,
880
+ "learning_rate": 6.381490458684407e-05,
881
+ "loss": 0.5489,
882
+ "step": 5600
883
+ },
884
+ {
885
+ "epoch": 1.906360722119116,
886
+ "grad_norm": 0.9712995290756226,
887
+ "learning_rate": 6.212319886700289e-05,
888
+ "loss": 0.547,
889
+ "step": 5650
890
+ },
891
+ {
892
+ "epoch": 1.9232326640796356,
893
+ "grad_norm": 1.0615884065628052,
894
+ "learning_rate": 6.044405584274961e-05,
895
+ "loss": 0.54,
896
+ "step": 5700
897
+ },
898
+ {
899
+ "epoch": 1.9401046060401552,
900
+ "grad_norm": 1.0537705421447754,
901
+ "learning_rate": 5.8778032439749284e-05,
902
+ "loss": 0.5279,
903
+ "step": 5750
904
+ },
905
+ {
906
+ "epoch": 1.9569765480006749,
907
+ "grad_norm": 1.081616759300232,
908
+ "learning_rate": 5.7125681232248684e-05,
909
+ "loss": 0.5473,
910
+ "step": 5800
911
+ },
912
+ {
913
+ "epoch": 1.9738484899611946,
914
+ "grad_norm": 1.1939841508865356,
915
+ "learning_rate": 5.548755025980237e-05,
916
+ "loss": 0.5422,
917
+ "step": 5850
918
+ },
919
+ {
920
+ "epoch": 1.9907204319217142,
921
+ "grad_norm": 1.1471023559570312,
922
+ "learning_rate": 5.3864182845503296e-05,
923
+ "loss": 0.5484,
924
+ "step": 5900
925
+ },
926
+ {
927
+ "epoch": 2.0074236544626287,
928
+ "grad_norm": 1.2721205949783325,
929
+ "learning_rate": 5.225611741577716e-05,
930
+ "loss": 0.4849,
931
+ "step": 5950
932
+ },
933
+ {
934
+ "epoch": 2.024295596423148,
935
+ "grad_norm": 1.3426976203918457,
936
+ "learning_rate": 5.066388732180136e-05,
937
+ "loss": 0.4106,
938
+ "step": 6000
939
+ },
940
+ {
941
+ "epoch": 2.024295596423148,
942
+ "eval_loss": 0.563025176525116,
943
+ "eval_runtime": 296.3567,
944
+ "eval_samples_per_second": 8.422,
945
+ "eval_steps_per_second": 1.053,
946
+ "step": 6000
947
+ },
948
+ {
949
+ "epoch": 2.041167538383668,
950
+ "grad_norm": 1.2091578245162964,
951
+ "learning_rate": 4.908802066260697e-05,
952
+ "loss": 0.3968,
953
+ "step": 6050
954
+ },
955
+ {
956
+ "epoch": 2.0580394803441875,
957
+ "grad_norm": 1.1719856262207031,
958
+ "learning_rate": 4.7529040109922584e-05,
959
+ "loss": 0.4055,
960
+ "step": 6100
961
+ },
962
+ {
963
+ "epoch": 2.0749114223047074,
964
+ "grad_norm": 1.3152216672897339,
965
+ "learning_rate": 4.598746273481881e-05,
966
+ "loss": 0.395,
967
+ "step": 6150
968
+ },
969
+ {
970
+ "epoch": 2.091783364265227,
971
+ "grad_norm": 1.2058014869689941,
972
+ "learning_rate": 4.446379983620979e-05,
973
+ "loss": 0.3895,
974
+ "step": 6200
975
+ },
976
+ {
977
+ "epoch": 2.1086553062257467,
978
+ "grad_norm": 1.1317533254623413,
979
+ "learning_rate": 4.2988477878823355e-05,
980
+ "loss": 0.4027,
981
+ "step": 6250
982
+ },
983
+ {
984
+ "epoch": 2.125527248186266,
985
+ "grad_norm": 1.241541862487793,
986
+ "learning_rate": 4.1501770661428595e-05,
987
+ "loss": 0.411,
988
+ "step": 6300
989
+ },
990
+ {
991
+ "epoch": 2.142399190146786,
992
+ "grad_norm": 1.170419454574585,
993
+ "learning_rate": 4.003446570150093e-05,
994
+ "loss": 0.3995,
995
+ "step": 6350
996
+ },
997
+ {
998
+ "epoch": 2.1592711321073055,
999
+ "grad_norm": 1.35177743434906,
1000
+ "learning_rate": 3.858704966383232e-05,
1001
+ "loss": 0.4068,
1002
+ "step": 6400
1003
+ },
1004
+ {
1005
+ "epoch": 2.1761430740678254,
1006
+ "grad_norm": 1.2839558124542236,
1007
+ "learning_rate": 3.71600026166051e-05,
1008
+ "loss": 0.4122,
1009
+ "step": 6450
1010
+ },
1011
+ {
1012
+ "epoch": 2.193015016028345,
1013
+ "grad_norm": 1.3839994668960571,
1014
+ "learning_rate": 3.575379787216629e-05,
1015
+ "loss": 0.4044,
1016
+ "step": 6500
1017
+ },
1018
+ {
1019
+ "epoch": 2.193015016028345,
1020
+ "eval_loss": 0.5397977232933044,
1021
+ "eval_runtime": 296.486,
1022
+ "eval_samples_per_second": 8.419,
1023
+ "eval_steps_per_second": 1.052,
1024
+ "step": 6500
1025
+ },
1026
+ {
1027
+ "epoch": 2.2098869579888647,
1028
+ "grad_norm": 1.2803045511245728,
1029
+ "learning_rate": 3.436890183004309e-05,
1030
+ "loss": 0.3822,
1031
+ "step": 6550
1032
+ },
1033
+ {
1034
+ "epoch": 2.226758899949384,
1035
+ "grad_norm": 1.3744126558303833,
1036
+ "learning_rate": 3.300577382225076e-05,
1037
+ "loss": 0.3915,
1038
+ "step": 6600
1039
+ },
1040
+ {
1041
+ "epoch": 2.243630841909904,
1042
+ "grad_norm": 1.1824839115142822,
1043
+ "learning_rate": 3.1664865960945e-05,
1044
+ "loss": 0.3884,
1045
+ "step": 6650
1046
+ },
1047
+ {
1048
+ "epoch": 2.2605027838704235,
1049
+ "grad_norm": 1.3948158025741577,
1050
+ "learning_rate": 3.03466229884686e-05,
1051
+ "loss": 0.382,
1052
+ "step": 6700
1053
+ },
1054
+ {
1055
+ "epoch": 2.277374725830943,
1056
+ "grad_norm": 1.2800052165985107,
1057
+ "learning_rate": 2.9051482129842577e-05,
1058
+ "loss": 0.3914,
1059
+ "step": 6750
1060
+ },
1061
+ {
1062
+ "epoch": 2.294246667791463,
1063
+ "grad_norm": 1.3162052631378174,
1064
+ "learning_rate": 2.777987294775086e-05,
1065
+ "loss": 0.3865,
1066
+ "step": 6800
1067
+ },
1068
+ {
1069
+ "epoch": 2.3111186097519827,
1070
+ "grad_norm": 1.1745421886444092,
1071
+ "learning_rate": 2.6532217200065858e-05,
1072
+ "loss": 0.3752,
1073
+ "step": 6850
1074
+ },
1075
+ {
1076
+ "epoch": 2.327990551712502,
1077
+ "grad_norm": 1.5241754055023193,
1078
+ "learning_rate": 2.5308928699963153e-05,
1079
+ "loss": 0.368,
1080
+ "step": 6900
1081
+ },
1082
+ {
1083
+ "epoch": 2.3448624936730216,
1084
+ "grad_norm": 1.3691622018814087,
1085
+ "learning_rate": 2.4110413178670878e-05,
1086
+ "loss": 0.3715,
1087
+ "step": 6950
1088
+ },
1089
+ {
1090
+ "epoch": 2.3617344356335415,
1091
+ "grad_norm": 1.206244707107544,
1092
+ "learning_rate": 2.2937068150899967e-05,
1093
+ "loss": 0.3781,
1094
+ "step": 7000
1095
+ },
1096
+ {
1097
+ "epoch": 2.3617344356335415,
1098
+ "eval_loss": 0.5177870392799377,
1099
+ "eval_runtime": 296.651,
1100
+ "eval_samples_per_second": 8.414,
1101
+ "eval_steps_per_second": 1.052,
1102
+ "step": 7000
1103
+ },
1104
+ {
1105
+ "epoch": 2.378606377594061,
1106
+ "grad_norm": 1.3172132968902588,
1107
+ "learning_rate": 2.1789282782999254e-05,
1108
+ "loss": 0.3787,
1109
+ "step": 7050
1110
+ },
1111
+ {
1112
+ "epoch": 2.395478319554581,
1113
+ "grad_norm": 1.4043761491775513,
1114
+ "learning_rate": 2.066743776387974e-05,
1115
+ "loss": 0.3798,
1116
+ "step": 7100
1117
+ },
1118
+ {
1119
+ "epoch": 2.4123502615151002,
1120
+ "grad_norm": 1.2046414613723755,
1121
+ "learning_rate": 1.957190517875064e-05,
1122
+ "loss": 0.3755,
1123
+ "step": 7150
1124
+ },
1125
+ {
1126
+ "epoch": 2.42922220347562,
1127
+ "grad_norm": 1.214189052581787,
1128
+ "learning_rate": 1.850304838570879e-05,
1129
+ "loss": 0.3722,
1130
+ "step": 7200
1131
+ },
1132
+ {
1133
+ "epoch": 2.4460941454361396,
1134
+ "grad_norm": 1.3583543300628662,
1135
+ "learning_rate": 1.7461221895222724e-05,
1136
+ "loss": 0.3667,
1137
+ "step": 7250
1138
+ },
1139
+ {
1140
+ "epoch": 2.4629660873966595,
1141
+ "grad_norm": 1.5781126022338867,
1142
+ "learning_rate": 1.644677125255143e-05,
1143
+ "loss": 0.3672,
1144
+ "step": 7300
1145
+ },
1146
+ {
1147
+ "epoch": 2.479838029357179,
1148
+ "grad_norm": 1.2086187601089478,
1149
+ "learning_rate": 1.546003292313629e-05,
1150
+ "loss": 0.358,
1151
+ "step": 7350
1152
+ },
1153
+ {
1154
+ "epoch": 2.496709971317699,
1155
+ "grad_norm": 1.1627147197723389,
1156
+ "learning_rate": 1.4501334181004889e-05,
1157
+ "loss": 0.3826,
1158
+ "step": 7400
1159
+ },
1160
+ {
1161
+ "epoch": 2.5135819132782182,
1162
+ "grad_norm": 1.4055496454238892,
1163
+ "learning_rate": 1.3570993000223043e-05,
1164
+ "loss": 0.356,
1165
+ "step": 7450
1166
+ },
1167
+ {
1168
+ "epoch": 2.530453855238738,
1169
+ "grad_norm": 1.2799688577651978,
1170
+ "learning_rate": 1.2669317949431659e-05,
1171
+ "loss": 0.3576,
1172
+ "step": 7500
1173
+ },
1174
+ {
1175
+ "epoch": 2.530453855238738,
1176
+ "eval_loss": 0.49860402941703796,
1177
+ "eval_runtime": 296.3625,
1178
+ "eval_samples_per_second": 8.422,
1179
+ "eval_steps_per_second": 1.053,
1180
+ "step": 7500
1181
+ },
1182
+ {
1183
+ "epoch": 2.5473257971992576,
1184
+ "grad_norm": 1.321845531463623,
1185
+ "learning_rate": 1.1796608089502948e-05,
1186
+ "loss": 0.3661,
1187
+ "step": 7550
1188
+ },
1189
+ {
1190
+ "epoch": 2.5641977391597774,
1191
+ "grad_norm": 1.3550702333450317,
1192
+ "learning_rate": 1.0953152874350059e-05,
1193
+ "loss": 0.365,
1194
+ "step": 7600
1195
+ },
1196
+ {
1197
+ "epoch": 2.581069681120297,
1198
+ "grad_norm": 1.2584900856018066,
1199
+ "learning_rate": 1.0139232054923287e-05,
1200
+ "loss": 0.3535,
1201
+ "step": 7650
1202
+ },
1203
+ {
1204
+ "epoch": 2.5979416230808168,
1205
+ "grad_norm": 1.376833438873291,
1206
+ "learning_rate": 9.355115586424224e-06,
1207
+ "loss": 0.3502,
1208
+ "step": 7700
1209
+ },
1210
+ {
1211
+ "epoch": 2.614813565041336,
1212
+ "grad_norm": 1.1930843591690063,
1213
+ "learning_rate": 8.601063538769182e-06,
1214
+ "loss": 0.3705,
1215
+ "step": 7750
1216
+ },
1217
+ {
1218
+ "epoch": 2.6316855070018557,
1219
+ "grad_norm": 1.4217926263809204,
1220
+ "learning_rate": 7.877326010330977e-06,
1221
+ "loss": 0.3459,
1222
+ "step": 7800
1223
+ },
1224
+ {
1225
+ "epoch": 2.6485574489623755,
1226
+ "grad_norm": 1.2762850522994995,
1227
+ "learning_rate": 7.1841430449882895e-06,
1228
+ "loss": 0.3436,
1229
+ "step": 7850
1230
+ },
1231
+ {
1232
+ "epoch": 2.6654293909228954,
1233
+ "grad_norm": 1.2758060693740845,
1234
+ "learning_rate": 6.521744552509635e-06,
1235
+ "loss": 0.3575,
1236
+ "step": 7900
1237
+ },
1238
+ {
1239
+ "epoch": 2.682301332883415,
1240
+ "grad_norm": 1.4101016521453857,
1241
+ "learning_rate": 5.890350232298591e-06,
1242
+ "loss": 0.3513,
1243
+ "step": 7950
1244
+ },
1245
+ {
1246
+ "epoch": 2.6991732748439343,
1247
+ "grad_norm": 1.1934149265289307,
1248
+ "learning_rate": 5.290169500525577e-06,
1249
+ "loss": 0.3472,
1250
+ "step": 8000
1251
+ },
1252
+ {
1253
+ "epoch": 2.6991732748439343,
1254
+ "eval_loss": 0.4890361726284027,
1255
+ "eval_runtime": 296.7147,
1256
+ "eval_samples_per_second": 8.412,
1257
+ "eval_steps_per_second": 1.052,
1258
+ "step": 8000
1259
+ },
1260
+ {
1261
+ "epoch": 2.716045216804454,
1262
+ "grad_norm": 1.1966798305511475,
1263
+ "learning_rate": 4.721401420670224e-06,
1264
+ "loss": 0.3476,
1265
+ "step": 8050
1266
+ },
1267
+ {
1268
+ "epoch": 2.732917158764974,
1269
+ "grad_norm": 1.326464295387268,
1270
+ "learning_rate": 4.184234637497486e-06,
1271
+ "loss": 0.3607,
1272
+ "step": 8100
1273
+ },
1274
+ {
1275
+ "epoch": 2.7497891007254935,
1276
+ "grad_norm": 1.1875063180923462,
1277
+ "learning_rate": 3.6788473144893976e-06,
1278
+ "loss": 0.3421,
1279
+ "step": 8150
1280
+ },
1281
+ {
1282
+ "epoch": 2.766661042686013,
1283
+ "grad_norm": 1.318291425704956,
1284
+ "learning_rate": 3.20540707475302e-06,
1285
+ "loss": 0.3567,
1286
+ "step": 8200
1287
+ },
1288
+ {
1289
+ "epoch": 2.783532984646533,
1290
+ "grad_norm": 1.3796924352645874,
1291
+ "learning_rate": 2.7640709454245904e-06,
1292
+ "loss": 0.3543,
1293
+ "step": 8250
1294
+ },
1295
+ {
1296
+ "epoch": 2.8004049266070523,
1297
+ "grad_norm": 1.188733696937561,
1298
+ "learning_rate": 2.3549853055878314e-06,
1299
+ "loss": 0.3461,
1300
+ "step": 8300
1301
+ },
1302
+ {
1303
+ "epoch": 2.817276868567572,
1304
+ "grad_norm": 1.4971429109573364,
1305
+ "learning_rate": 1.978285837724092e-06,
1306
+ "loss": 0.345,
1307
+ "step": 8350
1308
+ },
1309
+ {
1310
+ "epoch": 2.8341488105280916,
1311
+ "grad_norm": 1.218836784362793,
1312
+ "learning_rate": 1.6340974827101286e-06,
1313
+ "loss": 0.3628,
1314
+ "step": 8400
1315
+ },
1316
+ {
1317
+ "epoch": 2.8510207524886115,
1318
+ "grad_norm": 1.3097290992736816,
1319
+ "learning_rate": 1.3225343983787054e-06,
1320
+ "loss": 0.3515,
1321
+ "step": 8450
1322
+ },
1323
+ {
1324
+ "epoch": 2.867892694449131,
1325
+ "grad_norm": 1.602677822113037,
1326
+ "learning_rate": 1.0436999216555276e-06,
1327
+ "loss": 0.3506,
1328
+ "step": 8500
1329
+ },
1330
+ {
1331
+ "epoch": 2.867892694449131,
1332
+ "eval_loss": 0.48525503277778625,
1333
+ "eval_runtime": 296.1461,
1334
+ "eval_samples_per_second": 8.428,
1335
+ "eval_steps_per_second": 1.054,
1336
+ "step": 8500
1337
+ }
1338
+ ],
1339
+ "logging_steps": 50,
1340
+ "max_steps": 8892,
1341
+ "num_input_tokens_seen": 0,
1342
+ "num_train_epochs": 3,
1343
+ "save_steps": 500,
1344
+ "stateful_callbacks": {
1345
+ "TrainerControl": {
1346
+ "args": {
1347
+ "should_epoch_stop": false,
1348
+ "should_evaluate": false,
1349
+ "should_log": false,
1350
+ "should_save": true,
1351
+ "should_training_stop": false
1352
+ },
1353
+ "attributes": {}
1354
+ }
1355
+ },
1356
+ "total_flos": 5.897794151360692e+18,
1357
+ "train_batch_size": 8,
1358
+ "trial_name": null,
1359
+ "trial_params": null
1360
+ }
checkpoint-8500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89962928316ba9167e9dc9efd4961d21c08e95c6bd6d9ad576dbd0f3c12dcdaf
3
+ size 5688
checkpoint-8892/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:mistralai/Mistral-7B-Instruct-v0.2
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-8892/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "v_proj",
34
+ "up_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-8892/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a1eee9c4e2ea5c1715066b179c6310891607fce306c1fbbcccd5bec9e964658
3
+ size 335604696
checkpoint-8892/chat_template.jinja ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0]['role'] == 'system' %}
2
+ {%- set system_message = messages[0]['content'] %}
3
+ {%- set loop_messages = messages[1:] %}
4
+ {%- else %}
5
+ {%- set loop_messages = messages %}
6
+ {%- endif %}
7
+
8
+ {{- bos_token }}
9
+ {%- for message in loop_messages %}
10
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
11
+ {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
12
+ {%- endif %}
13
+ {%- if message['role'] == 'user' %}
14
+ {%- if loop.first and system_message is defined %}
15
+ {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }}
16
+ {%- else %}
17
+ {{- ' [INST] ' + message['content'] + ' [/INST]' }}
18
+ {%- endif %}
19
+ {%- elif message['role'] == 'assistant' %}
20
+ {{- ' ' + message['content'] + eos_token}}
21
+ {%- else %}
22
+ {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
23
+ {%- endif %}
24
+ {%- endfor %}
checkpoint-8892/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4d77f57ea57de18290a3c24d533d67672053aa6701c28dcd1bcb7c573c485f2
3
+ size 671466706
checkpoint-8892/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b9512d0f3bb196d04b0b009f14bcc5e93ec5a06396c54f604aaf155347c279f
3
+ size 14244
checkpoint-8892/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55643735d58dd4a0834e482b714c2d08dd2314c07893bc0cfe906c0cc1756d43
3
+ size 988
checkpoint-8892/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451501e506444b3ce6856a786bd233a9c0c78c93bbd16ff058d031b11754317d
3
+ size 1064
checkpoint-8892/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-8892/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-8892/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-8892/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-8892/trainer_state.json ADDED
@@ -0,0 +1,1409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 8892,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016871941960519655,
14
+ "grad_norm": 2.0177435874938965,
15
+ "learning_rate": 3.670411985018727e-05,
16
+ "loss": 1.9696,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.03374388392103931,
21
+ "grad_norm": 1.9107558727264404,
22
+ "learning_rate": 7.415730337078653e-05,
23
+ "loss": 1.5446,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.05061582588155897,
28
+ "grad_norm": 1.3704568147659302,
29
+ "learning_rate": 0.00011161048689138578,
30
+ "loss": 1.3815,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.06748776784207862,
35
+ "grad_norm": 1.3545180559158325,
36
+ "learning_rate": 0.00014906367041198505,
37
+ "loss": 1.2514,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.08435970980259828,
42
+ "grad_norm": 1.2799007892608643,
43
+ "learning_rate": 0.00018651685393258427,
44
+ "loss": 1.1922,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.10123165176311794,
49
+ "grad_norm": 1.1978015899658203,
50
+ "learning_rate": 0.0001999932072351269,
51
+ "loss": 1.1556,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.11810359372363759,
56
+ "grad_norm": 0.9664444923400879,
57
+ "learning_rate": 0.00019995539875714444,
58
+ "loss": 1.0915,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.13497553568415724,
63
+ "grad_norm": 1.0787692070007324,
64
+ "learning_rate": 0.0001998844378161928,
65
+ "loss": 1.0562,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.1518474776446769,
70
+ "grad_norm": 1.0797227621078491,
71
+ "learning_rate": 0.00019978034794806892,
72
+ "loss": 1.0253,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.16871941960519657,
77
+ "grad_norm": 0.9610917568206787,
78
+ "learning_rate": 0.00019964316367652584,
79
+ "loss": 1.0084,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.16871941960519657,
84
+ "eval_loss": 0.9864674210548401,
85
+ "eval_runtime": 297.0281,
86
+ "eval_samples_per_second": 8.403,
87
+ "eval_steps_per_second": 1.05,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 0.1855913615657162,
92
+ "grad_norm": 1.020251750946045,
93
+ "learning_rate": 0.00019947293050182204,
94
+ "loss": 0.9978,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 0.20246330352623587,
99
+ "grad_norm": 1.024293303489685,
100
+ "learning_rate": 0.00019926970488563033,
101
+ "loss": 0.9751,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 0.21933524548675554,
106
+ "grad_norm": 1.01112699508667,
107
+ "learning_rate": 0.00019903355423231105,
108
+ "loss": 0.9533,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 0.23620718744727517,
113
+ "grad_norm": 0.9841225147247314,
114
+ "learning_rate": 0.00019876455686655583,
115
+ "loss": 0.9311,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 0.2530791294077948,
120
+ "grad_norm": 0.96900475025177,
121
+ "learning_rate": 0.00019846280200740965,
122
+ "loss": 0.9292,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 0.2699510713683145,
127
+ "grad_norm": 0.9467640519142151,
128
+ "learning_rate": 0.000198128389738679,
129
+ "loss": 0.9316,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 0.28682301332883414,
134
+ "grad_norm": 0.9673274755477905,
135
+ "learning_rate": 0.00019776143097573705,
136
+ "loss": 0.8972,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 0.3036949552893538,
141
+ "grad_norm": 0.8616816401481628,
142
+ "learning_rate": 0.00019736204742873604,
143
+ "loss": 0.8998,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 0.3205668972498735,
148
+ "grad_norm": 0.9249860644340515,
149
+ "learning_rate": 0.00019693037156223942,
150
+ "loss": 0.8788,
151
+ "step": 950
152
+ },
153
+ {
154
+ "epoch": 0.33743883921039314,
155
+ "grad_norm": 0.8553484678268433,
156
+ "learning_rate": 0.00019646654655128672,
157
+ "loss": 0.8766,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 0.33743883921039314,
162
+ "eval_loss": 0.8600347638130188,
163
+ "eval_runtime": 296.4688,
164
+ "eval_samples_per_second": 8.419,
165
+ "eval_steps_per_second": 1.052,
166
+ "step": 1000
167
+ },
168
+ {
169
+ "epoch": 0.35431078117091275,
170
+ "grad_norm": 0.9525033235549927,
171
+ "learning_rate": 0.00019597072623390668,
172
+ "loss": 0.8831,
173
+ "step": 1050
174
+ },
175
+ {
176
+ "epoch": 0.3711827231314324,
177
+ "grad_norm": 0.9420183300971985,
178
+ "learning_rate": 0.00019544307506009313,
179
+ "loss": 0.8662,
180
+ "step": 1100
181
+ },
182
+ {
183
+ "epoch": 0.3880546650919521,
184
+ "grad_norm": 0.9950663447380066,
185
+ "learning_rate": 0.00019488376803726153,
186
+ "loss": 0.8687,
187
+ "step": 1150
188
+ },
189
+ {
190
+ "epoch": 0.40492660705247174,
191
+ "grad_norm": 0.8866828083992004,
192
+ "learning_rate": 0.00019429299067220387,
193
+ "loss": 0.8676,
194
+ "step": 1200
195
+ },
196
+ {
197
+ "epoch": 0.4217985490129914,
198
+ "grad_norm": 0.9560692310333252,
199
+ "learning_rate": 0.00019367093890956108,
200
+ "loss": 0.8552,
201
+ "step": 1250
202
+ },
203
+ {
204
+ "epoch": 0.4386704909735111,
205
+ "grad_norm": 0.9126181602478027,
206
+ "learning_rate": 0.00019301781906683362,
207
+ "loss": 0.8335,
208
+ "step": 1300
209
+ },
210
+ {
211
+ "epoch": 0.4555424329340307,
212
+ "grad_norm": 0.9437697529792786,
213
+ "learning_rate": 0.0001923338477659515,
214
+ "loss": 0.8424,
215
+ "step": 1350
216
+ },
217
+ {
218
+ "epoch": 0.47241437489455035,
219
+ "grad_norm": 0.9593287110328674,
220
+ "learning_rate": 0.00019161925186142692,
221
+ "loss": 0.8386,
222
+ "step": 1400
223
+ },
224
+ {
225
+ "epoch": 0.48928631685507,
226
+ "grad_norm": 0.9925290942192078,
227
+ "learning_rate": 0.00019087426836511277,
228
+ "loss": 0.8431,
229
+ "step": 1450
230
+ },
231
+ {
232
+ "epoch": 0.5061582588155896,
233
+ "grad_norm": 0.9618675112724304,
234
+ "learning_rate": 0.00019009914436759223,
235
+ "loss": 0.8299,
236
+ "step": 1500
237
+ },
238
+ {
239
+ "epoch": 0.5061582588155896,
240
+ "eval_loss": 0.8129043579101562,
241
+ "eval_runtime": 296.3926,
242
+ "eval_samples_per_second": 8.421,
243
+ "eval_steps_per_second": 1.053,
244
+ "step": 1500
245
+ },
246
+ {
247
+ "epoch": 0.5230302007761093,
248
+ "grad_norm": 0.951849639415741,
249
+ "learning_rate": 0.00018929413695622572,
250
+ "loss": 0.8211,
251
+ "step": 1550
252
+ },
253
+ {
254
+ "epoch": 0.539902142736629,
255
+ "grad_norm": 0.9019031524658203,
256
+ "learning_rate": 0.00018845951312988196,
257
+ "loss": 0.8234,
258
+ "step": 1600
259
+ },
260
+ {
261
+ "epoch": 0.5567740846971486,
262
+ "grad_norm": 0.9506643414497375,
263
+ "learning_rate": 0.00018759554971038196,
264
+ "loss": 0.8194,
265
+ "step": 1650
266
+ },
267
+ {
268
+ "epoch": 0.5736460266576683,
269
+ "grad_norm": 0.9535942077636719,
270
+ "learning_rate": 0.00018670253325068456,
271
+ "loss": 0.8017,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 0.590517968618188,
276
+ "grad_norm": 1.0262322425842285,
277
+ "learning_rate": 0.00018578075993984488,
278
+ "loss": 0.8162,
279
+ "step": 1750
280
+ },
281
+ {
282
+ "epoch": 0.6073899105787076,
283
+ "grad_norm": 0.8758232593536377,
284
+ "learning_rate": 0.00018483053550477649,
285
+ "loss": 0.7917,
286
+ "step": 1800
287
+ },
288
+ {
289
+ "epoch": 0.6242618525392273,
290
+ "grad_norm": 0.9430511593818665,
291
+ "learning_rate": 0.00018385217510885008,
292
+ "loss": 0.8057,
293
+ "step": 1850
294
+ },
295
+ {
296
+ "epoch": 0.641133794499747,
297
+ "grad_norm": 0.8898816108703613,
298
+ "learning_rate": 0.00018284600324736257,
299
+ "loss": 0.7983,
300
+ "step": 1900
301
+ },
302
+ {
303
+ "epoch": 0.6580057364602666,
304
+ "grad_norm": 0.8455677628517151,
305
+ "learning_rate": 0.00018181235363991087,
306
+ "loss": 0.7793,
307
+ "step": 1950
308
+ },
309
+ {
310
+ "epoch": 0.6748776784207863,
311
+ "grad_norm": 0.8179712295532227,
312
+ "learning_rate": 0.00018075156911970616,
313
+ "loss": 0.7874,
314
+ "step": 2000
315
+ },
316
+ {
317
+ "epoch": 0.6748776784207863,
318
+ "eval_loss": 0.774493932723999,
319
+ "eval_runtime": 296.6518,
320
+ "eval_samples_per_second": 8.414,
321
+ "eval_steps_per_second": 1.052,
322
+ "step": 2000
323
+ },
324
+ {
325
+ "epoch": 0.6917496203813059,
326
+ "grad_norm": 1.012557029724121,
327
+ "learning_rate": 0.00017966400151986562,
328
+ "loss": 0.7822,
329
+ "step": 2050
330
+ },
331
+ {
332
+ "epoch": 0.7086215623418255,
333
+ "grad_norm": 0.8059019446372986,
334
+ "learning_rate": 0.00017855001155671905,
335
+ "loss": 0.7862,
336
+ "step": 2100
337
+ },
338
+ {
339
+ "epoch": 0.7254935043023452,
340
+ "grad_norm": 0.8411868810653687,
341
+ "learning_rate": 0.00017740996871016903,
342
+ "loss": 0.7789,
343
+ "step": 2150
344
+ },
345
+ {
346
+ "epoch": 0.7423654462628648,
347
+ "grad_norm": 0.9554468989372253,
348
+ "learning_rate": 0.0001762442511011448,
349
+ "loss": 0.7709,
350
+ "step": 2200
351
+ },
352
+ {
353
+ "epoch": 0.7592373882233845,
354
+ "grad_norm": 0.947722852230072,
355
+ "learning_rate": 0.00017505324536618968,
356
+ "loss": 0.7572,
357
+ "step": 2250
358
+ },
359
+ {
360
+ "epoch": 0.7761093301839042,
361
+ "grad_norm": 1.0708439350128174,
362
+ "learning_rate": 0.0001738373465292245,
363
+ "loss": 0.775,
364
+ "step": 2300
365
+ },
366
+ {
367
+ "epoch": 0.7929812721444238,
368
+ "grad_norm": 0.9122579097747803,
369
+ "learning_rate": 0.00017259695787052895,
370
+ "loss": 0.7638,
371
+ "step": 2350
372
+ },
373
+ {
374
+ "epoch": 0.8098532141049435,
375
+ "grad_norm": 1.1625076532363892,
376
+ "learning_rate": 0.00017133249079298455,
377
+ "loss": 0.7654,
378
+ "step": 2400
379
+ },
380
+ {
381
+ "epoch": 0.8267251560654632,
382
+ "grad_norm": 0.971483051776886,
383
+ "learning_rate": 0.0001700443646856237,
384
+ "loss": 0.7503,
385
+ "step": 2450
386
+ },
387
+ {
388
+ "epoch": 0.8435970980259828,
389
+ "grad_norm": 0.9200385808944702,
390
+ "learning_rate": 0.0001687330067845297,
391
+ "loss": 0.7752,
392
+ "step": 2500
393
+ },
394
+ {
395
+ "epoch": 0.8435970980259828,
396
+ "eval_loss": 0.7435723543167114,
397
+ "eval_runtime": 296.1583,
398
+ "eval_samples_per_second": 8.428,
399
+ "eval_steps_per_second": 1.053,
400
+ "step": 2500
401
+ },
402
+ {
403
+ "epoch": 0.8604690399865025,
404
+ "grad_norm": 1.0115654468536377,
405
+ "learning_rate": 0.00016739885203113442,
406
+ "loss": 0.7602,
407
+ "step": 2550
408
+ },
409
+ {
410
+ "epoch": 0.8773409819470221,
411
+ "grad_norm": 1.0669232606887817,
412
+ "learning_rate": 0.00016604234292796007,
413
+ "loss": 0.7585,
414
+ "step": 2600
415
+ },
416
+ {
417
+ "epoch": 0.8942129239075418,
418
+ "grad_norm": 0.9092018604278564,
419
+ "learning_rate": 0.00016466392939185317,
420
+ "loss": 0.7534,
421
+ "step": 2650
422
+ },
423
+ {
424
+ "epoch": 0.9110848658680614,
425
+ "grad_norm": 1.232256293296814,
426
+ "learning_rate": 0.00016326406860475977,
427
+ "loss": 0.7418,
428
+ "step": 2700
429
+ },
430
+ {
431
+ "epoch": 0.927956807828581,
432
+ "grad_norm": 1.0810918807983398,
433
+ "learning_rate": 0.00016184322486209043,
434
+ "loss": 0.7439,
435
+ "step": 2750
436
+ },
437
+ {
438
+ "epoch": 0.9448287497891007,
439
+ "grad_norm": 0.9745123982429504,
440
+ "learning_rate": 0.00016040186941872631,
441
+ "loss": 0.7421,
442
+ "step": 2800
443
+ },
444
+ {
445
+ "epoch": 0.9617006917496204,
446
+ "grad_norm": 1.1044626235961914,
447
+ "learning_rate": 0.00015894048033271684,
448
+ "loss": 0.7388,
449
+ "step": 2850
450
+ },
451
+ {
452
+ "epoch": 0.97857263371014,
453
+ "grad_norm": 0.9833297729492188,
454
+ "learning_rate": 0.00015745954230672105,
455
+ "loss": 0.7364,
456
+ "step": 2900
457
+ },
458
+ {
459
+ "epoch": 0.9954445756706597,
460
+ "grad_norm": 1.0058021545410156,
461
+ "learning_rate": 0.00015595954652724485,
462
+ "loss": 0.742,
463
+ "step": 2950
464
+ },
465
+ {
466
+ "epoch": 1.012147798211574,
467
+ "grad_norm": 1.026547908782959,
468
+ "learning_rate": 0.00015444099050172807,
469
+ "loss": 0.6689,
470
+ "step": 3000
471
+ },
472
+ {
473
+ "epoch": 1.012147798211574,
474
+ "eval_loss": 0.7237228155136108,
475
+ "eval_runtime": 296.57,
476
+ "eval_samples_per_second": 8.416,
477
+ "eval_steps_per_second": 1.052,
478
+ "step": 3000
479
+ },
480
+ {
481
+ "epoch": 1.0290197401720937,
482
+ "grad_norm": 1.1261744499206543,
483
+ "learning_rate": 0.0001529043778935349,
484
+ "loss": 0.6521,
485
+ "step": 3050
486
+ },
487
+ {
488
+ "epoch": 1.0458916821326134,
489
+ "grad_norm": 1.1573985815048218,
490
+ "learning_rate": 0.00015138147018095146,
491
+ "loss": 0.6497,
492
+ "step": 3100
493
+ },
494
+ {
495
+ "epoch": 1.062763624093133,
496
+ "grad_norm": 1.098228931427002,
497
+ "learning_rate": 0.00014981061472467248,
498
+ "loss": 0.6544,
499
+ "step": 3150
500
+ },
501
+ {
502
+ "epoch": 1.0796355660536527,
503
+ "grad_norm": 1.1172102689743042,
504
+ "learning_rate": 0.00014822323845430378,
505
+ "loss": 0.6586,
506
+ "step": 3200
507
+ },
508
+ {
509
+ "epoch": 1.0965075080141724,
510
+ "grad_norm": 1.016932487487793,
511
+ "learning_rate": 0.0001466198678589963,
512
+ "loss": 0.6721,
513
+ "step": 3250
514
+ },
515
+ {
516
+ "epoch": 1.113379449974692,
517
+ "grad_norm": 0.972201406955719,
518
+ "learning_rate": 0.00014500103473277963,
519
+ "loss": 0.6687,
520
+ "step": 3300
521
+ },
522
+ {
523
+ "epoch": 1.1302513919352117,
524
+ "grad_norm": 1.0418280363082886,
525
+ "learning_rate": 0.0001433672759981806,
526
+ "loss": 0.6643,
527
+ "step": 3350
528
+ },
529
+ {
530
+ "epoch": 1.1471233338957314,
531
+ "grad_norm": 1.0220147371292114,
532
+ "learning_rate": 0.00014171913352814075,
533
+ "loss": 0.6538,
534
+ "step": 3400
535
+ },
536
+ {
537
+ "epoch": 1.163995275856251,
538
+ "grad_norm": 0.9467246532440186,
539
+ "learning_rate": 0.000140057153966292,
540
+ "loss": 0.6427,
541
+ "step": 3450
542
+ },
543
+ {
544
+ "epoch": 1.1808672178167707,
545
+ "grad_norm": 1.1602420806884766,
546
+ "learning_rate": 0.00013838188854564993,
547
+ "loss": 0.6496,
548
+ "step": 3500
549
+ },
550
+ {
551
+ "epoch": 1.1808672178167707,
552
+ "eval_loss": 0.6991020441055298,
553
+ "eval_runtime": 296.4391,
554
+ "eval_samples_per_second": 8.42,
555
+ "eval_steps_per_second": 1.052,
556
+ "step": 3500
557
+ },
558
+ {
559
+ "epoch": 1.1977391597772904,
560
+ "grad_norm": 1.0359673500061035,
561
+ "learning_rate": 0.00013669389290578491,
562
+ "loss": 0.6574,
563
+ "step": 3550
564
+ },
565
+ {
566
+ "epoch": 1.21461110173781,
567
+ "grad_norm": 1.0353167057037354,
568
+ "learning_rate": 0.0001349937269085317,
569
+ "loss": 0.6462,
570
+ "step": 3600
571
+ },
572
+ {
573
+ "epoch": 1.2314830436983297,
574
+ "grad_norm": 0.9980498552322388,
575
+ "learning_rate": 0.00013328195445229868,
576
+ "loss": 0.6515,
577
+ "step": 3650
578
+ },
579
+ {
580
+ "epoch": 1.2483549856588494,
581
+ "grad_norm": 1.051719069480896,
582
+ "learning_rate": 0.0001315591432850381,
583
+ "loss": 0.6546,
584
+ "step": 3700
585
+ },
586
+ {
587
+ "epoch": 1.265226927619369,
588
+ "grad_norm": 1.1057497262954712,
589
+ "learning_rate": 0.0001298258648159399,
590
+ "loss": 0.6313,
591
+ "step": 3750
592
+ },
593
+ {
594
+ "epoch": 1.2820988695798887,
595
+ "grad_norm": 1.1292750835418701,
596
+ "learning_rate": 0.0001280826939259106,
597
+ "loss": 0.6329,
598
+ "step": 3800
599
+ },
600
+ {
601
+ "epoch": 1.2989708115404084,
602
+ "grad_norm": 1.1297887563705444,
603
+ "learning_rate": 0.00012633020877690155,
604
+ "loss": 0.6384,
605
+ "step": 3850
606
+ },
607
+ {
608
+ "epoch": 1.3158427535009278,
609
+ "grad_norm": 0.9869160652160645,
610
+ "learning_rate": 0.00012456899062014806,
611
+ "loss": 0.6226,
612
+ "step": 3900
613
+ },
614
+ {
615
+ "epoch": 1.3327146954614477,
616
+ "grad_norm": 0.9497949481010437,
617
+ "learning_rate": 0.00012279962360338447,
618
+ "loss": 0.6225,
619
+ "step": 3950
620
+ },
621
+ {
622
+ "epoch": 1.3495866374219672,
623
+ "grad_norm": 1.053124189376831,
624
+ "learning_rate": 0.00012102269457709843,
625
+ "loss": 0.6196,
626
+ "step": 4000
627
+ },
628
+ {
629
+ "epoch": 1.3495866374219672,
630
+ "eval_loss": 0.6678062081336975,
631
+ "eval_runtime": 296.2326,
632
+ "eval_samples_per_second": 8.426,
633
+ "eval_steps_per_second": 1.053,
634
+ "step": 4000
635
+ },
636
+ {
637
+ "epoch": 1.366458579382487,
638
+ "grad_norm": 1.1627007722854614,
639
+ "learning_rate": 0.0001192387928998886,
640
+ "loss": 0.6527,
641
+ "step": 4050
642
+ },
643
+ {
644
+ "epoch": 1.3833305213430065,
645
+ "grad_norm": 1.329759955406189,
646
+ "learning_rate": 0.00011744851024299069,
647
+ "loss": 0.6297,
648
+ "step": 4100
649
+ },
650
+ {
651
+ "epoch": 1.4002024633035262,
652
+ "grad_norm": 1.1688311100006104,
653
+ "learning_rate": 0.00011565244039403622,
654
+ "loss": 0.63,
655
+ "step": 4150
656
+ },
657
+ {
658
+ "epoch": 1.4170744052640458,
659
+ "grad_norm": 0.9974623918533325,
660
+ "learning_rate": 0.00011385117906010953,
661
+ "loss": 0.6394,
662
+ "step": 4200
663
+ },
664
+ {
665
+ "epoch": 1.4339463472245655,
666
+ "grad_norm": 1.0659152269363403,
667
+ "learning_rate": 0.00011204532367016806,
668
+ "loss": 0.6181,
669
+ "step": 4250
670
+ },
671
+ {
672
+ "epoch": 1.4508182891850852,
673
+ "grad_norm": 1.0745680332183838,
674
+ "learning_rate": 0.00011027170545816326,
675
+ "loss": 0.6281,
676
+ "step": 4300
677
+ },
678
+ {
679
+ "epoch": 1.4676902311456048,
680
+ "grad_norm": 1.1593629121780396,
681
+ "learning_rate": 0.00010845852214547601,
682
+ "loss": 0.6296,
683
+ "step": 4350
684
+ },
685
+ {
686
+ "epoch": 1.4845621731061245,
687
+ "grad_norm": 1.3579237461090088,
688
+ "learning_rate": 0.00010664253337309687,
689
+ "loss": 0.6152,
690
+ "step": 4400
691
+ },
692
+ {
693
+ "epoch": 1.5014341150666441,
694
+ "grad_norm": 1.2364161014556885,
695
+ "learning_rate": 0.00010482434145467046,
696
+ "loss": 0.6067,
697
+ "step": 4450
698
+ },
699
+ {
700
+ "epoch": 1.5183060570271638,
701
+ "grad_norm": 1.0740337371826172,
702
+ "learning_rate": 0.00010300454943456457,
703
+ "loss": 0.6175,
704
+ "step": 4500
705
+ },
706
+ {
707
+ "epoch": 1.5183060570271638,
708
+ "eval_loss": 0.6415057182312012,
709
+ "eval_runtime": 296.7867,
710
+ "eval_samples_per_second": 8.41,
711
+ "eval_steps_per_second": 1.051,
712
+ "step": 4500
713
+ },
714
+ {
715
+ "epoch": 1.5351779989876835,
716
+ "grad_norm": 1.1068068742752075,
717
+ "learning_rate": 0.00010118376088785673,
718
+ "loss": 0.6221,
719
+ "step": 4550
720
+ },
721
+ {
722
+ "epoch": 1.5520499409482031,
723
+ "grad_norm": 1.1202526092529297,
724
+ "learning_rate": 9.936257972014506e-05,
725
+ "loss": 0.6198,
726
+ "step": 4600
727
+ },
728
+ {
729
+ "epoch": 1.5689218829087228,
730
+ "grad_norm": 1.1293288469314575,
731
+ "learning_rate": 9.754160996724927e-05,
732
+ "loss": 0.5997,
733
+ "step": 4650
734
+ },
735
+ {
736
+ "epoch": 1.5857938248692425,
737
+ "grad_norm": 1.0531474351882935,
738
+ "learning_rate": 9.572145559486855e-05,
739
+ "loss": 0.6041,
740
+ "step": 4700
741
+ },
742
+ {
743
+ "epoch": 1.6026657668297621,
744
+ "grad_norm": 0.895604133605957,
745
+ "learning_rate": 9.390272029826282e-05,
746
+ "loss": 0.6005,
747
+ "step": 4750
748
+ },
749
+ {
750
+ "epoch": 1.6195377087902818,
751
+ "grad_norm": 1.1192911863327026,
752
+ "learning_rate": 9.208600730202339e-05,
753
+ "loss": 0.5992,
754
+ "step": 4800
755
+ },
756
+ {
757
+ "epoch": 1.6364096507508015,
758
+ "grad_norm": 1.1751166582107544,
759
+ "learning_rate": 9.027191916000018e-05,
760
+ "loss": 0.586,
761
+ "step": 4850
762
+ },
763
+ {
764
+ "epoch": 1.6532815927113211,
765
+ "grad_norm": 0.9682310223579407,
766
+ "learning_rate": 8.846105755545086e-05,
767
+ "loss": 0.5969,
768
+ "step": 4900
769
+ },
770
+ {
771
+ "epoch": 1.6701535346718406,
772
+ "grad_norm": 1.1701383590698242,
773
+ "learning_rate": 8.665402310147924e-05,
774
+ "loss": 0.579,
775
+ "step": 4950
776
+ },
777
+ {
778
+ "epoch": 1.6870254766323605,
779
+ "grad_norm": 1.0403779745101929,
780
+ "learning_rate": 8.485141514182825e-05,
781
+ "loss": 0.5788,
782
+ "step": 5000
783
+ },
784
+ {
785
+ "epoch": 1.6870254766323605,
786
+ "eval_loss": 0.6112694144248962,
787
+ "eval_runtime": 296.234,
788
+ "eval_samples_per_second": 8.426,
789
+ "eval_steps_per_second": 1.053,
790
+ "step": 5000
791
+ },
792
+ {
793
+ "epoch": 1.70389741859288,
794
+ "grad_norm": 1.126141905784607,
795
+ "learning_rate": 8.305383155209414e-05,
796
+ "loss": 0.5862,
797
+ "step": 5050
798
+ },
799
+ {
800
+ "epoch": 1.7207693605533998,
801
+ "grad_norm": 1.0474798679351807,
802
+ "learning_rate": 8.126186854142752e-05,
803
+ "loss": 0.579,
804
+ "step": 5100
805
+ },
806
+ {
807
+ "epoch": 1.7376413025139192,
808
+ "grad_norm": 2.020526647567749,
809
+ "learning_rate": 7.947612045478724e-05,
810
+ "loss": 0.5636,
811
+ "step": 5150
812
+ },
813
+ {
814
+ "epoch": 1.7545132444744391,
815
+ "grad_norm": 1.0213319063186646,
816
+ "learning_rate": 7.76971795758122e-05,
817
+ "loss": 0.5695,
818
+ "step": 5200
819
+ },
820
+ {
821
+ "epoch": 1.7713851864349586,
822
+ "grad_norm": 1.034336805343628,
823
+ "learning_rate": 7.592563593037746e-05,
824
+ "loss": 0.5849,
825
+ "step": 5250
826
+ },
827
+ {
828
+ "epoch": 1.7882571283954785,
829
+ "grad_norm": 1.0699772834777832,
830
+ "learning_rate": 7.41972662287419e-05,
831
+ "loss": 0.5714,
832
+ "step": 5300
833
+ },
834
+ {
835
+ "epoch": 1.805129070355998,
836
+ "grad_norm": 1.1747502088546753,
837
+ "learning_rate": 7.244210001050232e-05,
838
+ "loss": 0.5604,
839
+ "step": 5350
840
+ },
841
+ {
842
+ "epoch": 1.8220010123165178,
843
+ "grad_norm": 1.082960605621338,
844
+ "learning_rate": 7.069607399149428e-05,
845
+ "loss": 0.5551,
846
+ "step": 5400
847
+ },
848
+ {
849
+ "epoch": 1.8388729542770372,
850
+ "grad_norm": 0.9143629670143127,
851
+ "learning_rate": 6.895976728063694e-05,
852
+ "loss": 0.5581,
853
+ "step": 5450
854
+ },
855
+ {
856
+ "epoch": 1.855744896237557,
857
+ "grad_norm": 1.0392253398895264,
858
+ "learning_rate": 6.723375576322166e-05,
859
+ "loss": 0.5506,
860
+ "step": 5500
861
+ },
862
+ {
863
+ "epoch": 1.855744896237557,
864
+ "eval_loss": 0.5858550667762756,
865
+ "eval_runtime": 296.794,
866
+ "eval_samples_per_second": 8.41,
867
+ "eval_steps_per_second": 1.051,
868
+ "step": 5500
869
+ },
870
+ {
871
+ "epoch": 1.8726168381980766,
872
+ "grad_norm": 1.1118271350860596,
873
+ "learning_rate": 6.551861190990665e-05,
874
+ "loss": 0.5508,
875
+ "step": 5550
876
+ },
877
+ {
878
+ "epoch": 1.8894887801585962,
879
+ "grad_norm": 1.0717848539352417,
880
+ "learning_rate": 6.381490458684407e-05,
881
+ "loss": 0.5489,
882
+ "step": 5600
883
+ },
884
+ {
885
+ "epoch": 1.906360722119116,
886
+ "grad_norm": 0.9712995290756226,
887
+ "learning_rate": 6.212319886700289e-05,
888
+ "loss": 0.547,
889
+ "step": 5650
890
+ },
891
+ {
892
+ "epoch": 1.9232326640796356,
893
+ "grad_norm": 1.0615884065628052,
894
+ "learning_rate": 6.044405584274961e-05,
895
+ "loss": 0.54,
896
+ "step": 5700
897
+ },
898
+ {
899
+ "epoch": 1.9401046060401552,
900
+ "grad_norm": 1.0537705421447754,
901
+ "learning_rate": 5.8778032439749284e-05,
902
+ "loss": 0.5279,
903
+ "step": 5750
904
+ },
905
+ {
906
+ "epoch": 1.9569765480006749,
907
+ "grad_norm": 1.081616759300232,
908
+ "learning_rate": 5.7125681232248684e-05,
909
+ "loss": 0.5473,
910
+ "step": 5800
911
+ },
912
+ {
913
+ "epoch": 1.9738484899611946,
914
+ "grad_norm": 1.1939841508865356,
915
+ "learning_rate": 5.548755025980237e-05,
916
+ "loss": 0.5422,
917
+ "step": 5850
918
+ },
919
+ {
920
+ "epoch": 1.9907204319217142,
921
+ "grad_norm": 1.1471023559570312,
922
+ "learning_rate": 5.3864182845503296e-05,
923
+ "loss": 0.5484,
924
+ "step": 5900
925
+ },
926
+ {
927
+ "epoch": 2.0074236544626287,
928
+ "grad_norm": 1.2721205949783325,
929
+ "learning_rate": 5.225611741577716e-05,
930
+ "loss": 0.4849,
931
+ "step": 5950
932
+ },
933
+ {
934
+ "epoch": 2.024295596423148,
935
+ "grad_norm": 1.3426976203918457,
936
+ "learning_rate": 5.066388732180136e-05,
937
+ "loss": 0.4106,
938
+ "step": 6000
939
+ },
940
+ {
941
+ "epoch": 2.024295596423148,
942
+ "eval_loss": 0.563025176525116,
943
+ "eval_runtime": 296.3567,
944
+ "eval_samples_per_second": 8.422,
945
+ "eval_steps_per_second": 1.053,
946
+ "step": 6000
947
+ },
948
+ {
949
+ "epoch": 2.041167538383668,
950
+ "grad_norm": 1.2091578245162964,
951
+ "learning_rate": 4.908802066260697e-05,
952
+ "loss": 0.3968,
953
+ "step": 6050
954
+ },
955
+ {
956
+ "epoch": 2.0580394803441875,
957
+ "grad_norm": 1.1719856262207031,
958
+ "learning_rate": 4.7529040109922584e-05,
959
+ "loss": 0.4055,
960
+ "step": 6100
961
+ },
962
+ {
963
+ "epoch": 2.0749114223047074,
964
+ "grad_norm": 1.3152216672897339,
965
+ "learning_rate": 4.598746273481881e-05,
966
+ "loss": 0.395,
967
+ "step": 6150
968
+ },
969
+ {
970
+ "epoch": 2.091783364265227,
971
+ "grad_norm": 1.2058014869689941,
972
+ "learning_rate": 4.446379983620979e-05,
973
+ "loss": 0.3895,
974
+ "step": 6200
975
+ },
976
+ {
977
+ "epoch": 2.1086553062257467,
978
+ "grad_norm": 1.1317533254623413,
979
+ "learning_rate": 4.2988477878823355e-05,
980
+ "loss": 0.4027,
981
+ "step": 6250
982
+ },
983
+ {
984
+ "epoch": 2.125527248186266,
985
+ "grad_norm": 1.241541862487793,
986
+ "learning_rate": 4.1501770661428595e-05,
987
+ "loss": 0.411,
988
+ "step": 6300
989
+ },
990
+ {
991
+ "epoch": 2.142399190146786,
992
+ "grad_norm": 1.170419454574585,
993
+ "learning_rate": 4.003446570150093e-05,
994
+ "loss": 0.3995,
995
+ "step": 6350
996
+ },
997
+ {
998
+ "epoch": 2.1592711321073055,
999
+ "grad_norm": 1.35177743434906,
1000
+ "learning_rate": 3.858704966383232e-05,
1001
+ "loss": 0.4068,
1002
+ "step": 6400
1003
+ },
1004
+ {
1005
+ "epoch": 2.1761430740678254,
1006
+ "grad_norm": 1.2839558124542236,
1007
+ "learning_rate": 3.71600026166051e-05,
1008
+ "loss": 0.4122,
1009
+ "step": 6450
1010
+ },
1011
+ {
1012
+ "epoch": 2.193015016028345,
1013
+ "grad_norm": 1.3839994668960571,
1014
+ "learning_rate": 3.575379787216629e-05,
1015
+ "loss": 0.4044,
1016
+ "step": 6500
1017
+ },
1018
+ {
1019
+ "epoch": 2.193015016028345,
1020
+ "eval_loss": 0.5397977232933044,
1021
+ "eval_runtime": 296.486,
1022
+ "eval_samples_per_second": 8.419,
1023
+ "eval_steps_per_second": 1.052,
1024
+ "step": 6500
1025
+ },
1026
+ {
1027
+ "epoch": 2.2098869579888647,
1028
+ "grad_norm": 1.2803045511245728,
1029
+ "learning_rate": 3.436890183004309e-05,
1030
+ "loss": 0.3822,
1031
+ "step": 6550
1032
+ },
1033
+ {
1034
+ "epoch": 2.226758899949384,
1035
+ "grad_norm": 1.3744126558303833,
1036
+ "learning_rate": 3.300577382225076e-05,
1037
+ "loss": 0.3915,
1038
+ "step": 6600
1039
+ },
1040
+ {
1041
+ "epoch": 2.243630841909904,
1042
+ "grad_norm": 1.1824839115142822,
1043
+ "learning_rate": 3.1664865960945e-05,
1044
+ "loss": 0.3884,
1045
+ "step": 6650
1046
+ },
1047
+ {
1048
+ "epoch": 2.2605027838704235,
1049
+ "grad_norm": 1.3948158025741577,
1050
+ "learning_rate": 3.03466229884686e-05,
1051
+ "loss": 0.382,
1052
+ "step": 6700
1053
+ },
1054
+ {
1055
+ "epoch": 2.277374725830943,
1056
+ "grad_norm": 1.2800052165985107,
1057
+ "learning_rate": 2.9051482129842577e-05,
1058
+ "loss": 0.3914,
1059
+ "step": 6750
1060
+ },
1061
+ {
1062
+ "epoch": 2.294246667791463,
1063
+ "grad_norm": 1.3162052631378174,
1064
+ "learning_rate": 2.777987294775086e-05,
1065
+ "loss": 0.3865,
1066
+ "step": 6800
1067
+ },
1068
+ {
1069
+ "epoch": 2.3111186097519827,
1070
+ "grad_norm": 1.1745421886444092,
1071
+ "learning_rate": 2.6532217200065858e-05,
1072
+ "loss": 0.3752,
1073
+ "step": 6850
1074
+ },
1075
+ {
1076
+ "epoch": 2.327990551712502,
1077
+ "grad_norm": 1.5241754055023193,
1078
+ "learning_rate": 2.5308928699963153e-05,
1079
+ "loss": 0.368,
1080
+ "step": 6900
1081
+ },
1082
+ {
1083
+ "epoch": 2.3448624936730216,
1084
+ "grad_norm": 1.3691622018814087,
1085
+ "learning_rate": 2.4110413178670878e-05,
1086
+ "loss": 0.3715,
1087
+ "step": 6950
1088
+ },
1089
+ {
1090
+ "epoch": 2.3617344356335415,
1091
+ "grad_norm": 1.206244707107544,
1092
+ "learning_rate": 2.2937068150899967e-05,
1093
+ "loss": 0.3781,
1094
+ "step": 7000
1095
+ },
1096
+ {
1097
+ "epoch": 2.3617344356335415,
1098
+ "eval_loss": 0.5177870392799377,
1099
+ "eval_runtime": 296.651,
1100
+ "eval_samples_per_second": 8.414,
1101
+ "eval_steps_per_second": 1.052,
1102
+ "step": 7000
1103
+ },
1104
+ {
1105
+ "epoch": 2.378606377594061,
1106
+ "grad_norm": 1.3172132968902588,
1107
+ "learning_rate": 2.1789282782999254e-05,
1108
+ "loss": 0.3787,
1109
+ "step": 7050
1110
+ },
1111
+ {
1112
+ "epoch": 2.395478319554581,
1113
+ "grad_norm": 1.4043761491775513,
1114
+ "learning_rate": 2.066743776387974e-05,
1115
+ "loss": 0.3798,
1116
+ "step": 7100
1117
+ },
1118
+ {
1119
+ "epoch": 2.4123502615151002,
1120
+ "grad_norm": 1.2046414613723755,
1121
+ "learning_rate": 1.957190517875064e-05,
1122
+ "loss": 0.3755,
1123
+ "step": 7150
1124
+ },
1125
+ {
1126
+ "epoch": 2.42922220347562,
1127
+ "grad_norm": 1.214189052581787,
1128
+ "learning_rate": 1.850304838570879e-05,
1129
+ "loss": 0.3722,
1130
+ "step": 7200
1131
+ },
1132
+ {
1133
+ "epoch": 2.4460941454361396,
1134
+ "grad_norm": 1.3583543300628662,
1135
+ "learning_rate": 1.7461221895222724e-05,
1136
+ "loss": 0.3667,
1137
+ "step": 7250
1138
+ },
1139
+ {
1140
+ "epoch": 2.4629660873966595,
1141
+ "grad_norm": 1.5781126022338867,
1142
+ "learning_rate": 1.644677125255143e-05,
1143
+ "loss": 0.3672,
1144
+ "step": 7300
1145
+ },
1146
+ {
1147
+ "epoch": 2.479838029357179,
1148
+ "grad_norm": 1.2086187601089478,
1149
+ "learning_rate": 1.546003292313629e-05,
1150
+ "loss": 0.358,
1151
+ "step": 7350
1152
+ },
1153
+ {
1154
+ "epoch": 2.496709971317699,
1155
+ "grad_norm": 1.1627147197723389,
1156
+ "learning_rate": 1.4501334181004889e-05,
1157
+ "loss": 0.3826,
1158
+ "step": 7400
1159
+ },
1160
+ {
1161
+ "epoch": 2.5135819132782182,
1162
+ "grad_norm": 1.4055496454238892,
1163
+ "learning_rate": 1.3570993000223043e-05,
1164
+ "loss": 0.356,
1165
+ "step": 7450
1166
+ },
1167
+ {
1168
+ "epoch": 2.530453855238738,
1169
+ "grad_norm": 1.2799688577651978,
1170
+ "learning_rate": 1.2669317949431659e-05,
1171
+ "loss": 0.3576,
1172
+ "step": 7500
1173
+ },
1174
+ {
1175
+ "epoch": 2.530453855238738,
1176
+ "eval_loss": 0.49860402941703796,
1177
+ "eval_runtime": 296.3625,
1178
+ "eval_samples_per_second": 8.422,
1179
+ "eval_steps_per_second": 1.053,
1180
+ "step": 7500
1181
+ },
1182
+ {
1183
+ "epoch": 2.5473257971992576,
1184
+ "grad_norm": 1.321845531463623,
1185
+ "learning_rate": 1.1796608089502948e-05,
1186
+ "loss": 0.3661,
1187
+ "step": 7550
1188
+ },
1189
+ {
1190
+ "epoch": 2.5641977391597774,
1191
+ "grad_norm": 1.3550702333450317,
1192
+ "learning_rate": 1.0953152874350059e-05,
1193
+ "loss": 0.365,
1194
+ "step": 7600
1195
+ },
1196
+ {
1197
+ "epoch": 2.581069681120297,
1198
+ "grad_norm": 1.2584900856018066,
1199
+ "learning_rate": 1.0139232054923287e-05,
1200
+ "loss": 0.3535,
1201
+ "step": 7650
1202
+ },
1203
+ {
1204
+ "epoch": 2.5979416230808168,
1205
+ "grad_norm": 1.376833438873291,
1206
+ "learning_rate": 9.355115586424224e-06,
1207
+ "loss": 0.3502,
1208
+ "step": 7700
1209
+ },
1210
+ {
1211
+ "epoch": 2.614813565041336,
1212
+ "grad_norm": 1.1930843591690063,
1213
+ "learning_rate": 8.601063538769182e-06,
1214
+ "loss": 0.3705,
1215
+ "step": 7750
1216
+ },
1217
+ {
1218
+ "epoch": 2.6316855070018557,
1219
+ "grad_norm": 1.4217926263809204,
1220
+ "learning_rate": 7.877326010330977e-06,
1221
+ "loss": 0.3459,
1222
+ "step": 7800
1223
+ },
1224
+ {
1225
+ "epoch": 2.6485574489623755,
1226
+ "grad_norm": 1.2762850522994995,
1227
+ "learning_rate": 7.1841430449882895e-06,
1228
+ "loss": 0.3436,
1229
+ "step": 7850
1230
+ },
1231
+ {
1232
+ "epoch": 2.6654293909228954,
1233
+ "grad_norm": 1.2758060693740845,
1234
+ "learning_rate": 6.521744552509635e-06,
1235
+ "loss": 0.3575,
1236
+ "step": 7900
1237
+ },
1238
+ {
1239
+ "epoch": 2.682301332883415,
1240
+ "grad_norm": 1.4101016521453857,
1241
+ "learning_rate": 5.890350232298591e-06,
1242
+ "loss": 0.3513,
1243
+ "step": 7950
1244
+ },
1245
+ {
1246
+ "epoch": 2.6991732748439343,
1247
+ "grad_norm": 1.1934149265289307,
1248
+ "learning_rate": 5.290169500525577e-06,
1249
+ "loss": 0.3472,
1250
+ "step": 8000
1251
+ },
1252
+ {
1253
+ "epoch": 2.6991732748439343,
1254
+ "eval_loss": 0.4890361726284027,
1255
+ "eval_runtime": 296.7147,
1256
+ "eval_samples_per_second": 8.412,
1257
+ "eval_steps_per_second": 1.052,
1258
+ "step": 8000
1259
+ },
1260
+ {
1261
+ "epoch": 2.716045216804454,
1262
+ "grad_norm": 1.1966798305511475,
1263
+ "learning_rate": 4.721401420670224e-06,
1264
+ "loss": 0.3476,
1265
+ "step": 8050
1266
+ },
1267
+ {
1268
+ "epoch": 2.732917158764974,
1269
+ "grad_norm": 1.326464295387268,
1270
+ "learning_rate": 4.184234637497486e-06,
1271
+ "loss": 0.3607,
1272
+ "step": 8100
1273
+ },
1274
+ {
1275
+ "epoch": 2.7497891007254935,
1276
+ "grad_norm": 1.1875063180923462,
1277
+ "learning_rate": 3.6788473144893976e-06,
1278
+ "loss": 0.3421,
1279
+ "step": 8150
1280
+ },
1281
+ {
1282
+ "epoch": 2.766661042686013,
1283
+ "grad_norm": 1.318291425704956,
1284
+ "learning_rate": 3.20540707475302e-06,
1285
+ "loss": 0.3567,
1286
+ "step": 8200
1287
+ },
1288
+ {
1289
+ "epoch": 2.783532984646533,
1290
+ "grad_norm": 1.3796924352645874,
1291
+ "learning_rate": 2.7640709454245904e-06,
1292
+ "loss": 0.3543,
1293
+ "step": 8250
1294
+ },
1295
+ {
1296
+ "epoch": 2.8004049266070523,
1297
+ "grad_norm": 1.188733696937561,
1298
+ "learning_rate": 2.3549853055878314e-06,
1299
+ "loss": 0.3461,
1300
+ "step": 8300
1301
+ },
1302
+ {
1303
+ "epoch": 2.817276868567572,
1304
+ "grad_norm": 1.4971429109573364,
1305
+ "learning_rate": 1.978285837724092e-06,
1306
+ "loss": 0.345,
1307
+ "step": 8350
1308
+ },
1309
+ {
1310
+ "epoch": 2.8341488105280916,
1311
+ "grad_norm": 1.218836784362793,
1312
+ "learning_rate": 1.6340974827101286e-06,
1313
+ "loss": 0.3628,
1314
+ "step": 8400
1315
+ },
1316
+ {
1317
+ "epoch": 2.8510207524886115,
1318
+ "grad_norm": 1.3097290992736816,
1319
+ "learning_rate": 1.3225343983787054e-06,
1320
+ "loss": 0.3515,
1321
+ "step": 8450
1322
+ },
1323
+ {
1324
+ "epoch": 2.867892694449131,
1325
+ "grad_norm": 1.602677822113037,
1326
+ "learning_rate": 1.0436999216555276e-06,
1327
+ "loss": 0.3506,
1328
+ "step": 8500
1329
+ },
1330
+ {
1331
+ "epoch": 2.867892694449131,
1332
+ "eval_loss": 0.48525503277778625,
1333
+ "eval_runtime": 296.1461,
1334
+ "eval_samples_per_second": 8.428,
1335
+ "eval_steps_per_second": 1.054,
1336
+ "step": 8500
1337
+ },
1338
+ {
1339
+ "epoch": 2.884764636409651,
1340
+ "grad_norm": 1.332733392715454,
1341
+ "learning_rate": 7.976865342852469e-07,
1342
+ "loss": 0.3562,
1343
+ "step": 8550
1344
+ },
1345
+ {
1346
+ "epoch": 2.9016365783701703,
1347
+ "grad_norm": 1.2944486141204834,
1348
+ "learning_rate": 5.845758321577855e-07,
1349
+ "loss": 0.3544,
1350
+ "step": 8600
1351
+ },
1352
+ {
1353
+ "epoch": 2.91850852033069,
1354
+ "grad_norm": 1.2164134979248047,
1355
+ "learning_rate": 4.0443849824522985e-07,
1356
+ "loss": 0.3432,
1357
+ "step": 8650
1358
+ },
1359
+ {
1360
+ "epoch": 2.9353804622912096,
1361
+ "grad_norm": 1.24102783203125,
1362
+ "learning_rate": 2.5733427915823894e-07,
1363
+ "loss": 0.3481,
1364
+ "step": 8700
1365
+ },
1366
+ {
1367
+ "epoch": 2.9522524042517295,
1368
+ "grad_norm": 1.3387184143066406,
1369
+ "learning_rate": 1.433119653297177e-07,
1370
+ "loss": 0.3533,
1371
+ "step": 8750
1372
+ },
1373
+ {
1374
+ "epoch": 2.969124346212249,
1375
+ "grad_norm": 1.2963091135025024,
1376
+ "learning_rate": 6.240937483235066e-08,
1377
+ "loss": 0.3456,
1378
+ "step": 8800
1379
+ },
1380
+ {
1381
+ "epoch": 2.9859962881727684,
1382
+ "grad_norm": 1.0874204635620117,
1383
+ "learning_rate": 1.4653340835435458e-08,
1384
+ "loss": 0.3543,
1385
+ "step": 8850
1386
+ }
1387
+ ],
1388
+ "logging_steps": 50,
1389
+ "max_steps": 8892,
1390
+ "num_input_tokens_seen": 0,
1391
+ "num_train_epochs": 3,
1392
+ "save_steps": 500,
1393
+ "stateful_callbacks": {
1394
+ "TrainerControl": {
1395
+ "args": {
1396
+ "should_epoch_stop": false,
1397
+ "should_evaluate": false,
1398
+ "should_log": false,
1399
+ "should_save": true,
1400
+ "should_training_stop": true
1401
+ },
1402
+ "attributes": {}
1403
+ }
1404
+ },
1405
+ "total_flos": 6.169366398084317e+18,
1406
+ "train_batch_size": 8,
1407
+ "trial_name": null,
1408
+ "trial_params": null
1409
+ }
checkpoint-8892/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89962928316ba9167e9dc9efd4961d21c08e95c6bd6d9ad576dbd0f3c12dcdaf
3
+ size 5688
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89962928316ba9167e9dc9efd4961d21c08e95c6bd6d9ad576dbd0f3c12dcdaf
3
+ size 5688
training_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"base_model": "mistralai/Mistral-7B-Instruct-v0.2"}