MilaWang committed on Mar 28, 2025

Commit

9c03347

verified ·

1 Parent(s): 5e02a74

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/trainer_state.json +1078 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/trainer_state.json +1604 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2217/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/trainer_state.json +2130 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-2956/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-3695/trainer_state.json +2656 -0

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d33014e2f5731367ea6dac1c5d423224567dc295f06ac67fa0093f6ec18d6db9
+size 109069176

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d33014e2f5731367ea6dac1c5d423224567dc295f06ac67fa0093f6ec18d6db9
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c55582fca3ac44be5df62bca9c97dd904099f5e96ecb80704c8955644cf2ac89
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c9ec3759c36fb838233297c7b05ec743fddb5e5927529b2c58261df06b7ab81
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:660d34cbbcd33d08eff3fc3d1ae9682bd9ddfa10fef5100de462943540fe3de2
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1078 @@

+{
+  "best_metric": 1.28184974193573,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1478,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013531799729364006,
+      "grad_norm": 1.1050695180892944,
+      "learning_rate": 0.0002,
+      "loss": 1.9104,
+      "step": 10
+    },
+    {
+      "epoch": 0.02706359945872801,
+      "grad_norm": 1.2463239431381226,
+      "learning_rate": 0.0002,
+      "loss": 1.6832,
+      "step": 20
+    },
+    {
+      "epoch": 0.04059539918809202,
+      "grad_norm": 1.193475604057312,
+      "learning_rate": 0.0002,
+      "loss": 1.4273,
+      "step": 30
+    },
+    {
+      "epoch": 0.05412719891745602,
+      "grad_norm": 0.7777265310287476,
+      "learning_rate": 0.0002,
+      "loss": 1.4369,
+      "step": 40
+    },
+    {
+      "epoch": 0.06765899864682003,
+      "grad_norm": 1.4079619646072388,
+      "learning_rate": 0.0002,
+      "loss": 1.3618,
+      "step": 50
+    },
+    {
+      "epoch": 0.08119079837618404,
+      "grad_norm": 0.9451370239257812,
+      "learning_rate": 0.0002,
+      "loss": 1.2639,
+      "step": 60
+    },
+    {
+      "epoch": 0.09472259810554803,
+      "grad_norm": 0.7137989401817322,
+      "learning_rate": 0.0002,
+      "loss": 1.2494,
+      "step": 70
+    },
+    {
+      "epoch": 0.10825439783491204,
+      "grad_norm": 0.5521688461303711,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 80
+    },
+    {
+      "epoch": 0.12178619756427606,
+      "grad_norm": 0.8050252795219421,
+      "learning_rate": 0.0002,
+      "loss": 1.2882,
+      "step": 90
+    },
+    {
+      "epoch": 0.13531799729364005,
+      "grad_norm": 0.6771517992019653,
+      "learning_rate": 0.0002,
+      "loss": 1.2237,
+      "step": 100
+    },
+    {
+      "epoch": 0.14884979702300405,
+      "grad_norm": 0.8056462407112122,
+      "learning_rate": 0.0002,
+      "loss": 1.2265,
+      "step": 110
+    },
+    {
+      "epoch": 0.16238159675236807,
+      "grad_norm": 0.7240928411483765,
+      "learning_rate": 0.0002,
+      "loss": 1.3218,
+      "step": 120
+    },
+    {
+      "epoch": 0.17591339648173207,
+      "grad_norm": 0.5310224294662476,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 130
+    },
+    {
+      "epoch": 0.18944519621109607,
+      "grad_norm": 0.5730571150779724,
+      "learning_rate": 0.0002,
+      "loss": 1.2196,
+      "step": 140
+    },
+    {
+      "epoch": 0.2029769959404601,
+      "grad_norm": 0.5288769006729126,
+      "learning_rate": 0.0002,
+      "loss": 1.3183,
+      "step": 150
+    },
+    {
+      "epoch": 0.2165087956698241,
+      "grad_norm": 0.5447443723678589,
+      "learning_rate": 0.0002,
+      "loss": 1.1886,
+      "step": 160
+    },
+    {
+      "epoch": 0.23004059539918809,
+      "grad_norm": 2.2740917205810547,
+      "learning_rate": 0.0002,
+      "loss": 1.1994,
+      "step": 170
+    },
+    {
+      "epoch": 0.2435723951285521,
+      "grad_norm": 0.6200950741767883,
+      "learning_rate": 0.0002,
+      "loss": 1.2398,
+      "step": 180
+    },
+    {
+      "epoch": 0.2571041948579161,
+      "grad_norm": 1.1782610416412354,
+      "learning_rate": 0.0002,
+      "loss": 1.2827,
+      "step": 190
+    },
+    {
+      "epoch": 0.2706359945872801,
+      "grad_norm": 0.7893068194389343,
+      "learning_rate": 0.0002,
+      "loss": 1.1697,
+      "step": 200
+    },
+    {
+      "epoch": 0.28416779431664413,
+      "grad_norm": 0.5421761274337769,
+      "learning_rate": 0.0002,
+      "loss": 1.221,
+      "step": 210
+    },
+    {
+      "epoch": 0.2976995940460081,
+      "grad_norm": 0.5460169315338135,
+      "learning_rate": 0.0002,
+      "loss": 1.2019,
+      "step": 220
+    },
+    {
+      "epoch": 0.3112313937753721,
+      "grad_norm": 0.5957289934158325,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 230
+    },
+    {
+      "epoch": 0.32476319350473615,
+      "grad_norm": 0.745093047618866,
+      "learning_rate": 0.0002,
+      "loss": 1.2545,
+      "step": 240
+    },
+    {
+      "epoch": 0.3382949932341001,
+      "grad_norm": 0.622627317905426,
+      "learning_rate": 0.0002,
+      "loss": 1.2408,
+      "step": 250
+    },
+    {
+      "epoch": 0.35182679296346414,
+      "grad_norm": 0.5773138999938965,
+      "learning_rate": 0.0002,
+      "loss": 1.3083,
+      "step": 260
+    },
+    {
+      "epoch": 0.36535859269282817,
+      "grad_norm": 1.104275107383728,
+      "learning_rate": 0.0002,
+      "loss": 1.0475,
+      "step": 270
+    },
+    {
+      "epoch": 0.37889039242219213,
+      "grad_norm": 0.5755344033241272,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 280
+    },
+    {
+      "epoch": 0.39242219215155616,
+      "grad_norm": 0.5885311961174011,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 290
+    },
+    {
+      "epoch": 0.4059539918809202,
+      "grad_norm": 1.448182225227356,
+      "learning_rate": 0.0002,
+      "loss": 1.2099,
+      "step": 300
+    },
+    {
+      "epoch": 0.41948579161028415,
+      "grad_norm": 0.5983599424362183,
+      "learning_rate": 0.0002,
+      "loss": 1.2114,
+      "step": 310
+    },
+    {
+      "epoch": 0.4330175913396482,
+      "grad_norm": 0.5013539791107178,
+      "learning_rate": 0.0002,
+      "loss": 1.1691,
+      "step": 320
+    },
+    {
+      "epoch": 0.4465493910690122,
+      "grad_norm": 0.8935738205909729,
+      "learning_rate": 0.0002,
+      "loss": 1.138,
+      "step": 330
+    },
+    {
+      "epoch": 0.46008119079837617,
+      "grad_norm": 0.5642115473747253,
+      "learning_rate": 0.0002,
+      "loss": 1.1336,
+      "step": 340
+    },
+    {
+      "epoch": 0.4736129905277402,
+      "grad_norm": 0.7897255420684814,
+      "learning_rate": 0.0002,
+      "loss": 1.2355,
+      "step": 350
+    },
+    {
+      "epoch": 0.4871447902571042,
+      "grad_norm": 1.6891459226608276,
+      "learning_rate": 0.0002,
+      "loss": 1.1067,
+      "step": 360
+    },
+    {
+      "epoch": 0.5006765899864682,
+      "grad_norm": 1.1374807357788086,
+      "learning_rate": 0.0002,
+      "loss": 1.1787,
+      "step": 370
+    },
+    {
+      "epoch": 0.5142083897158322,
+      "grad_norm": 0.5355549454689026,
+      "learning_rate": 0.0002,
+      "loss": 1.1525,
+      "step": 380
+    },
+    {
+      "epoch": 0.5277401894451962,
+      "grad_norm": 0.656196653842926,
+      "learning_rate": 0.0002,
+      "loss": 1.1298,
+      "step": 390
+    },
+    {
+      "epoch": 0.5412719891745602,
+      "grad_norm": 0.8692356944084167,
+      "learning_rate": 0.0002,
+      "loss": 1.1471,
+      "step": 400
+    },
+    {
+      "epoch": 0.5548037889039242,
+      "grad_norm": 0.5873697400093079,
+      "learning_rate": 0.0002,
+      "loss": 0.9781,
+      "step": 410
+    },
+    {
+      "epoch": 0.5683355886332883,
+      "grad_norm": 0.8922758102416992,
+      "learning_rate": 0.0002,
+      "loss": 1.1841,
+      "step": 420
+    },
+    {
+      "epoch": 0.5818673883626523,
+      "grad_norm": 0.5048012733459473,
+      "learning_rate": 0.0002,
+      "loss": 1.184,
+      "step": 430
+    },
+    {
+      "epoch": 0.5953991880920162,
+      "grad_norm": 0.603631317615509,
+      "learning_rate": 0.0002,
+      "loss": 1.3276,
+      "step": 440
+    },
+    {
+      "epoch": 0.6089309878213802,
+      "grad_norm": 0.5635734796524048,
+      "learning_rate": 0.0002,
+      "loss": 1.0459,
+      "step": 450
+    },
+    {
+      "epoch": 0.6224627875507442,
+      "grad_norm": 0.7483186721801758,
+      "learning_rate": 0.0002,
+      "loss": 1.1549,
+      "step": 460
+    },
+    {
+      "epoch": 0.6359945872801083,
+      "grad_norm": 0.5372456312179565,
+      "learning_rate": 0.0002,
+      "loss": 1.1141,
+      "step": 470
+    },
+    {
+      "epoch": 0.6495263870094723,
+      "grad_norm": 0.9461246728897095,
+      "learning_rate": 0.0002,
+      "loss": 1.2966,
+      "step": 480
+    },
+    {
+      "epoch": 0.6630581867388363,
+      "grad_norm": 0.7017379403114319,
+      "learning_rate": 0.0002,
+      "loss": 1.1226,
+      "step": 490
+    },
+    {
+      "epoch": 0.6765899864682002,
+      "grad_norm": 1.6177887916564941,
+      "learning_rate": 0.0002,
+      "loss": 1.2138,
+      "step": 500
+    },
+    {
+      "epoch": 0.6901217861975643,
+      "grad_norm": 0.4857328236103058,
+      "learning_rate": 0.0002,
+      "loss": 1.0716,
+      "step": 510
+    },
+    {
+      "epoch": 0.7036535859269283,
+      "grad_norm": 1.0545706748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.2898,
+      "step": 520
+    },
+    {
+      "epoch": 0.7171853856562923,
+      "grad_norm": 0.7486541867256165,
+      "learning_rate": 0.0002,
+      "loss": 1.2527,
+      "step": 530
+    },
+    {
+      "epoch": 0.7307171853856563,
+      "grad_norm": 0.6588427424430847,
+      "learning_rate": 0.0002,
+      "loss": 1.0634,
+      "step": 540
+    },
+    {
+      "epoch": 0.7442489851150202,
+      "grad_norm": 0.9485914707183838,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 550
+    },
+    {
+      "epoch": 0.7577807848443843,
+      "grad_norm": 0.7465947866439819,
+      "learning_rate": 0.0002,
+      "loss": 1.1933,
+      "step": 560
+    },
+    {
+      "epoch": 0.7713125845737483,
+      "grad_norm": 0.6392837166786194,
+      "learning_rate": 0.0002,
+      "loss": 1.0927,
+      "step": 570
+    },
+    {
+      "epoch": 0.7848443843031123,
+      "grad_norm": 0.4898282587528229,
+      "learning_rate": 0.0002,
+      "loss": 1.14,
+      "step": 580
+    },
+    {
+      "epoch": 0.7983761840324763,
+      "grad_norm": 0.5636171102523804,
+      "learning_rate": 0.0002,
+      "loss": 1.0425,
+      "step": 590
+    },
+    {
+      "epoch": 0.8119079837618404,
+      "grad_norm": 0.6637675166130066,
+      "learning_rate": 0.0002,
+      "loss": 1.0717,
+      "step": 600
+    },
+    {
+      "epoch": 0.8254397834912043,
+      "grad_norm": 1.1842738389968872,
+      "learning_rate": 0.0002,
+      "loss": 1.1204,
+      "step": 610
+    },
+    {
+      "epoch": 0.8389715832205683,
+      "grad_norm": 0.5699004530906677,
+      "learning_rate": 0.0002,
+      "loss": 1.083,
+      "step": 620
+    },
+    {
+      "epoch": 0.8525033829499323,
+      "grad_norm": 0.7748669385910034,
+      "learning_rate": 0.0002,
+      "loss": 1.153,
+      "step": 630
+    },
+    {
+      "epoch": 0.8660351826792964,
+      "grad_norm": 0.7987180352210999,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 640
+    },
+    {
+      "epoch": 0.8795669824086604,
+      "grad_norm": 1.0740629434585571,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 650
+    },
+    {
+      "epoch": 0.8930987821380244,
+      "grad_norm": 0.731082022190094,
+      "learning_rate": 0.0002,
+      "loss": 1.0503,
+      "step": 660
+    },
+    {
+      "epoch": 0.9066305818673883,
+      "grad_norm": 0.9066846966743469,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 670
+    },
+    {
+      "epoch": 0.9201623815967523,
+      "grad_norm": 0.9934597015380859,
+      "learning_rate": 0.0002,
+      "loss": 1.0516,
+      "step": 680
+    },
+    {
+      "epoch": 0.9336941813261164,
+      "grad_norm": 0.7975896000862122,
+      "learning_rate": 0.0002,
+      "loss": 1.0983,
+      "step": 690
+    },
+    {
+      "epoch": 0.9472259810554804,
+      "grad_norm": 0.9127744436264038,
+      "learning_rate": 0.0002,
+      "loss": 1.2362,
+      "step": 700
+    },
+    {
+      "epoch": 0.9607577807848444,
+      "grad_norm": 0.7682064175605774,
+      "learning_rate": 0.0002,
+      "loss": 1.0273,
+      "step": 710
+    },
+    {
+      "epoch": 0.9742895805142084,
+      "grad_norm": 0.9808696508407593,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 720
+    },
+    {
+      "epoch": 0.9878213802435724,
+      "grad_norm": 1.0826992988586426,
+      "learning_rate": 0.0002,
+      "loss": 1.0979,
+      "step": 730
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2867646217346191,
+      "eval_runtime": 53.6569,
+      "eval_samples_per_second": 8.498,
+      "eval_steps_per_second": 1.062,
+      "step": 739
+    },
+    {
+      "epoch": 1.0013531799729365,
+      "grad_norm": 0.6498591303825378,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 740
+    },
+    {
+      "epoch": 1.0148849797023005,
+      "grad_norm": 0.8040738701820374,
+      "learning_rate": 0.0002,
+      "loss": 1.0737,
+      "step": 750
+    },
+    {
+      "epoch": 1.0284167794316643,
+      "grad_norm": 0.7280412912368774,
+      "learning_rate": 0.0002,
+      "loss": 1.03,
+      "step": 760
+    },
+    {
+      "epoch": 1.0419485791610283,
+      "grad_norm": 0.672149121761322,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 770
+    },
+    {
+      "epoch": 1.0554803788903924,
+      "grad_norm": 1.1186308860778809,
+      "learning_rate": 0.0002,
+      "loss": 0.997,
+      "step": 780
+    },
+    {
+      "epoch": 1.0690121786197564,
+      "grad_norm": 0.9073583483695984,
+      "learning_rate": 0.0002,
+      "loss": 0.9088,
+      "step": 790
+    },
+    {
+      "epoch": 1.0825439783491204,
+      "grad_norm": 0.6135605573654175,
+      "learning_rate": 0.0002,
+      "loss": 0.9413,
+      "step": 800
+    },
+    {
+      "epoch": 1.0960757780784844,
+      "grad_norm": 0.5854787826538086,
+      "learning_rate": 0.0002,
+      "loss": 0.9024,
+      "step": 810
+    },
+    {
+      "epoch": 1.1096075778078485,
+      "grad_norm": 0.9077727794647217,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 820
+    },
+    {
+      "epoch": 1.1231393775372125,
+      "grad_norm": 0.7072564363479614,
+      "learning_rate": 0.0002,
+      "loss": 0.9489,
+      "step": 830
+    },
+    {
+      "epoch": 1.1366711772665765,
+      "grad_norm": 0.9457924365997314,
+      "learning_rate": 0.0002,
+      "loss": 0.9275,
+      "step": 840
+    },
+    {
+      "epoch": 1.1502029769959405,
+      "grad_norm": 0.9216122031211853,
+      "learning_rate": 0.0002,
+      "loss": 0.9998,
+      "step": 850
+    },
+    {
+      "epoch": 1.1637347767253043,
+      "grad_norm": 1.0899791717529297,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 860
+    },
+    {
+      "epoch": 1.1772665764546684,
+      "grad_norm": 0.8594662547111511,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 870
+    },
+    {
+      "epoch": 1.1907983761840324,
+      "grad_norm": 0.8680914640426636,
+      "learning_rate": 0.0002,
+      "loss": 0.9513,
+      "step": 880
+    },
+    {
+      "epoch": 1.2043301759133964,
+      "grad_norm": 0.5579341650009155,
+      "learning_rate": 0.0002,
+      "loss": 0.9695,
+      "step": 890
+    },
+    {
+      "epoch": 1.2178619756427604,
+      "grad_norm": 0.8556986451148987,
+      "learning_rate": 0.0002,
+      "loss": 1.0153,
+      "step": 900
+    },
+    {
+      "epoch": 1.2313937753721245,
+      "grad_norm": 1.8943263292312622,
+      "learning_rate": 0.0002,
+      "loss": 0.9589,
+      "step": 910
+    },
+    {
+      "epoch": 1.2449255751014885,
+      "grad_norm": 0.7652221918106079,
+      "learning_rate": 0.0002,
+      "loss": 0.9554,
+      "step": 920
+    },
+    {
+      "epoch": 1.2584573748308525,
+      "grad_norm": 0.6921482086181641,
+      "learning_rate": 0.0002,
+      "loss": 0.955,
+      "step": 930
+    },
+    {
+      "epoch": 1.2719891745602165,
+      "grad_norm": 0.7211646437644958,
+      "learning_rate": 0.0002,
+      "loss": 1.0335,
+      "step": 940
+    },
+    {
+      "epoch": 1.2855209742895806,
+      "grad_norm": 0.9096421599388123,
+      "learning_rate": 0.0002,
+      "loss": 1.0597,
+      "step": 950
+    },
+    {
+      "epoch": 1.2990527740189446,
+      "grad_norm": 0.743715226650238,
+      "learning_rate": 0.0002,
+      "loss": 1.1143,
+      "step": 960
+    },
+    {
+      "epoch": 1.3125845737483086,
+      "grad_norm": 0.9247064590454102,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 970
+    },
+    {
+      "epoch": 1.3261163734776726,
+      "grad_norm": 1.0811798572540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9534,
+      "step": 980
+    },
+    {
+      "epoch": 1.3396481732070367,
+      "grad_norm": 0.7317015528678894,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 990
+    },
+    {
+      "epoch": 1.3531799729364005,
+      "grad_norm": 0.8399309515953064,
+      "learning_rate": 0.0002,
+      "loss": 1.0071,
+      "step": 1000
+    },
+    {
+      "epoch": 1.3667117726657645,
+      "grad_norm": 1.094558835029602,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 1010
+    },
+    {
+      "epoch": 1.3802435723951285,
+      "grad_norm": 1.3759856224060059,
+      "learning_rate": 0.0002,
+      "loss": 0.8744,
+      "step": 1020
+    },
+    {
+      "epoch": 1.3937753721244925,
+      "grad_norm": 0.8855497241020203,
+      "learning_rate": 0.0002,
+      "loss": 0.915,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4073071718538566,
+      "grad_norm": 3.6836671829223633,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1040
+    },
+    {
+      "epoch": 1.4208389715832206,
+      "grad_norm": 1.1119214296340942,
+      "learning_rate": 0.0002,
+      "loss": 0.8975,
+      "step": 1050
+    },
+    {
+      "epoch": 1.4343707713125846,
+      "grad_norm": 0.8871118426322937,
+      "learning_rate": 0.0002,
+      "loss": 0.9381,
+      "step": 1060
+    },
+    {
+      "epoch": 1.4479025710419486,
+      "grad_norm": 0.9937213063240051,
+      "learning_rate": 0.0002,
+      "loss": 0.9091,
+      "step": 1070
+    },
+    {
+      "epoch": 1.4614343707713127,
+      "grad_norm": 0.7206485867500305,
+      "learning_rate": 0.0002,
+      "loss": 0.9923,
+      "step": 1080
+    },
+    {
+      "epoch": 1.4749661705006765,
+      "grad_norm": 0.8442404866218567,
+      "learning_rate": 0.0002,
+      "loss": 0.951,
+      "step": 1090
+    },
+    {
+      "epoch": 1.4884979702300405,
+      "grad_norm": 0.9265049695968628,
+      "learning_rate": 0.0002,
+      "loss": 0.8609,
+      "step": 1100
+    },
+    {
+      "epoch": 1.5020297699594045,
+      "grad_norm": 1.1033650636672974,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 1110
+    },
+    {
+      "epoch": 1.5155615696887685,
+      "grad_norm": 0.7876176834106445,
+      "learning_rate": 0.0002,
+      "loss": 1.004,
+      "step": 1120
+    },
+    {
+      "epoch": 1.5290933694181326,
+      "grad_norm": 0.7761271595954895,
+      "learning_rate": 0.0002,
+      "loss": 0.9555,
+      "step": 1130
+    },
+    {
+      "epoch": 1.5426251691474966,
+      "grad_norm": 1.0603803396224976,
+      "learning_rate": 0.0002,
+      "loss": 0.9569,
+      "step": 1140
+    },
+    {
+      "epoch": 1.5561569688768606,
+      "grad_norm": 0.7715556621551514,
+      "learning_rate": 0.0002,
+      "loss": 0.9842,
+      "step": 1150
+    },
+    {
+      "epoch": 1.5696887686062246,
+      "grad_norm": 0.6591511368751526,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 1160
+    },
+    {
+      "epoch": 1.5832205683355887,
+      "grad_norm": 1.1773475408554077,
+      "learning_rate": 0.0002,
+      "loss": 0.9584,
+      "step": 1170
+    },
+    {
+      "epoch": 1.5967523680649527,
+      "grad_norm": 0.8513862490653992,
+      "learning_rate": 0.0002,
+      "loss": 0.9229,
+      "step": 1180
+    },
+    {
+      "epoch": 1.6102841677943167,
+      "grad_norm": 1.0796581506729126,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1190
+    },
+    {
+      "epoch": 1.6238159675236807,
+      "grad_norm": 0.8897230625152588,
+      "learning_rate": 0.0002,
+      "loss": 0.9698,
+      "step": 1200
+    },
+    {
+      "epoch": 1.6373477672530448,
+      "grad_norm": 1.4640971422195435,
+      "learning_rate": 0.0002,
+      "loss": 0.9295,
+      "step": 1210
+    },
+    {
+      "epoch": 1.6508795669824088,
+      "grad_norm": 1.123056173324585,
+      "learning_rate": 0.0002,
+      "loss": 1.003,
+      "step": 1220
+    },
+    {
+      "epoch": 1.6644113667117728,
+      "grad_norm": 1.1064175367355347,
+      "learning_rate": 0.0002,
+      "loss": 0.9524,
+      "step": 1230
+    },
+    {
+      "epoch": 1.6779431664411368,
+      "grad_norm": 2.4434642791748047,
+      "learning_rate": 0.0002,
+      "loss": 0.8896,
+      "step": 1240
+    },
+    {
+      "epoch": 1.6914749661705006,
+      "grad_norm": 1.0455760955810547,
+      "learning_rate": 0.0002,
+      "loss": 0.9899,
+      "step": 1250
+    },
+    {
+      "epoch": 1.7050067658998647,
+      "grad_norm": 1.1007593870162964,
+      "learning_rate": 0.0002,
+      "loss": 0.9032,
+      "step": 1260
+    },
+    {
+      "epoch": 1.7185385656292287,
+      "grad_norm": 1.2697606086730957,
+      "learning_rate": 0.0002,
+      "loss": 0.9226,
+      "step": 1270
+    },
+    {
+      "epoch": 1.7320703653585927,
+      "grad_norm": 1.1537855863571167,
+      "learning_rate": 0.0002,
+      "loss": 0.8771,
+      "step": 1280
+    },
+    {
+      "epoch": 1.7456021650879567,
+      "grad_norm": 0.9637187719345093,
+      "learning_rate": 0.0002,
+      "loss": 0.8655,
+      "step": 1290
+    },
+    {
+      "epoch": 1.7591339648173205,
+      "grad_norm": 1.1610347032546997,
+      "learning_rate": 0.0002,
+      "loss": 0.9641,
+      "step": 1300
+    },
+    {
+      "epoch": 1.7726657645466846,
+      "grad_norm": 0.717607319355011,
+      "learning_rate": 0.0002,
+      "loss": 0.9417,
+      "step": 1310
+    },
+    {
+      "epoch": 1.7861975642760486,
+      "grad_norm": 1.753371238708496,
+      "learning_rate": 0.0002,
+      "loss": 0.8852,
+      "step": 1320
+    },
+    {
+      "epoch": 1.7997293640054126,
+      "grad_norm": 0.7919637560844421,
+      "learning_rate": 0.0002,
+      "loss": 1.0327,
+      "step": 1330
+    },
+    {
+      "epoch": 1.8132611637347766,
+      "grad_norm": 1.1091023683547974,
+      "learning_rate": 0.0002,
+      "loss": 1.0019,
+      "step": 1340
+    },
+    {
+      "epoch": 1.8267929634641407,
+      "grad_norm": 0.7157362699508667,
+      "learning_rate": 0.0002,
+      "loss": 0.9457,
+      "step": 1350
+    },
+    {
+      "epoch": 1.8403247631935047,
+      "grad_norm": 0.9538856744766235,
+      "learning_rate": 0.0002,
+      "loss": 0.9818,
+      "step": 1360
+    },
+    {
+      "epoch": 1.8538565629228687,
+      "grad_norm": 1.689642071723938,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 1370
+    },
+    {
+      "epoch": 1.8673883626522327,
+      "grad_norm": 1.3405762910842896,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1380
+    },
+    {
+      "epoch": 1.8809201623815968,
+      "grad_norm": 1.187905192375183,
+      "learning_rate": 0.0002,
+      "loss": 0.9279,
+      "step": 1390
+    },
+    {
+      "epoch": 1.8944519621109608,
+      "grad_norm": 1.403511643409729,
+      "learning_rate": 0.0002,
+      "loss": 0.9266,
+      "step": 1400
+    },
+    {
+      "epoch": 1.9079837618403248,
+      "grad_norm": 1.4245457649230957,
+      "learning_rate": 0.0002,
+      "loss": 0.9654,
+      "step": 1410
+    },
+    {
+      "epoch": 1.9215155615696888,
+      "grad_norm": 0.6742255687713623,
+      "learning_rate": 0.0002,
+      "loss": 0.9047,
+      "step": 1420
+    },
+    {
+      "epoch": 1.9350473612990529,
+      "grad_norm": 0.9301473498344421,
+      "learning_rate": 0.0002,
+      "loss": 0.9837,
+      "step": 1430
+    },
+    {
+      "epoch": 1.9485791610284169,
+      "grad_norm": 0.8039385080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 1440
+    },
+    {
+      "epoch": 1.962110960757781,
+      "grad_norm": 0.7449126839637756,
+      "learning_rate": 0.0002,
+      "loss": 0.9433,
+      "step": 1450
+    },
+    {
+      "epoch": 1.975642760487145,
+      "grad_norm": 1.8016695976257324,
+      "learning_rate": 0.0002,
+      "loss": 0.9424,
+      "step": 1460
+    },
+    {
+      "epoch": 1.989174560216509,
+      "grad_norm": 1.3347259759902954,
+      "learning_rate": 0.0002,
+      "loss": 0.9434,
+      "step": 1470
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.28184974193573,
+      "eval_runtime": 54.9872,
+      "eval_samples_per_second": 8.293,
+      "eval_steps_per_second": 1.037,
+      "step": 1478
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5912,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.48468460732416e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c0e60ee7ec9d35429ff5330b6a72628063da6a9c843e01fa8c503c2ba1303f
+size 5624

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71a6704132ed2c4dd0aa66313a92b65f87647244f514b4fdbae8ff47504b3ced
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:078d223fdedd86e4629bbdc96960f3e0544c28e30511d4cf3fd39b351b1a8e28
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50ca2f4bcec4bc0625780e16ebdd8004300ebde5f39c5363d49a2ce5a8cb507c
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a55b0153b394031eb5cf421dcf9a4ccfd985b2991cd2bddd6697482b10d42a60
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1604 @@

+{
+  "best_metric": 1.28184974193573,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478",
+  "epoch": 3.0,
+  "eval_steps": 10,
+  "global_step": 2217,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013531799729364006,
+      "grad_norm": 1.1050695180892944,
+      "learning_rate": 0.0002,
+      "loss": 1.9104,
+      "step": 10
+    },
+    {
+      "epoch": 0.02706359945872801,
+      "grad_norm": 1.2463239431381226,
+      "learning_rate": 0.0002,
+      "loss": 1.6832,
+      "step": 20
+    },
+    {
+      "epoch": 0.04059539918809202,
+      "grad_norm": 1.193475604057312,
+      "learning_rate": 0.0002,
+      "loss": 1.4273,
+      "step": 30
+    },
+    {
+      "epoch": 0.05412719891745602,
+      "grad_norm": 0.7777265310287476,
+      "learning_rate": 0.0002,
+      "loss": 1.4369,
+      "step": 40
+    },
+    {
+      "epoch": 0.06765899864682003,
+      "grad_norm": 1.4079619646072388,
+      "learning_rate": 0.0002,
+      "loss": 1.3618,
+      "step": 50
+    },
+    {
+      "epoch": 0.08119079837618404,
+      "grad_norm": 0.9451370239257812,
+      "learning_rate": 0.0002,
+      "loss": 1.2639,
+      "step": 60
+    },
+    {
+      "epoch": 0.09472259810554803,
+      "grad_norm": 0.7137989401817322,
+      "learning_rate": 0.0002,
+      "loss": 1.2494,
+      "step": 70
+    },
+    {
+      "epoch": 0.10825439783491204,
+      "grad_norm": 0.5521688461303711,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 80
+    },
+    {
+      "epoch": 0.12178619756427606,
+      "grad_norm": 0.8050252795219421,
+      "learning_rate": 0.0002,
+      "loss": 1.2882,
+      "step": 90
+    },
+    {
+      "epoch": 0.13531799729364005,
+      "grad_norm": 0.6771517992019653,
+      "learning_rate": 0.0002,
+      "loss": 1.2237,
+      "step": 100
+    },
+    {
+      "epoch": 0.14884979702300405,
+      "grad_norm": 0.8056462407112122,
+      "learning_rate": 0.0002,
+      "loss": 1.2265,
+      "step": 110
+    },
+    {
+      "epoch": 0.16238159675236807,
+      "grad_norm": 0.7240928411483765,
+      "learning_rate": 0.0002,
+      "loss": 1.3218,
+      "step": 120
+    },
+    {
+      "epoch": 0.17591339648173207,
+      "grad_norm": 0.5310224294662476,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 130
+    },
+    {
+      "epoch": 0.18944519621109607,
+      "grad_norm": 0.5730571150779724,
+      "learning_rate": 0.0002,
+      "loss": 1.2196,
+      "step": 140
+    },
+    {
+      "epoch": 0.2029769959404601,
+      "grad_norm": 0.5288769006729126,
+      "learning_rate": 0.0002,
+      "loss": 1.3183,
+      "step": 150
+    },
+    {
+      "epoch": 0.2165087956698241,
+      "grad_norm": 0.5447443723678589,
+      "learning_rate": 0.0002,
+      "loss": 1.1886,
+      "step": 160
+    },
+    {
+      "epoch": 0.23004059539918809,
+      "grad_norm": 2.2740917205810547,
+      "learning_rate": 0.0002,
+      "loss": 1.1994,
+      "step": 170
+    },
+    {
+      "epoch": 0.2435723951285521,
+      "grad_norm": 0.6200950741767883,
+      "learning_rate": 0.0002,
+      "loss": 1.2398,
+      "step": 180
+    },
+    {
+      "epoch": 0.2571041948579161,
+      "grad_norm": 1.1782610416412354,
+      "learning_rate": 0.0002,
+      "loss": 1.2827,
+      "step": 190
+    },
+    {
+      "epoch": 0.2706359945872801,
+      "grad_norm": 0.7893068194389343,
+      "learning_rate": 0.0002,
+      "loss": 1.1697,
+      "step": 200
+    },
+    {
+      "epoch": 0.28416779431664413,
+      "grad_norm": 0.5421761274337769,
+      "learning_rate": 0.0002,
+      "loss": 1.221,
+      "step": 210
+    },
+    {
+      "epoch": 0.2976995940460081,
+      "grad_norm": 0.5460169315338135,
+      "learning_rate": 0.0002,
+      "loss": 1.2019,
+      "step": 220
+    },
+    {
+      "epoch": 0.3112313937753721,
+      "grad_norm": 0.5957289934158325,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 230
+    },
+    {
+      "epoch": 0.32476319350473615,
+      "grad_norm": 0.745093047618866,
+      "learning_rate": 0.0002,
+      "loss": 1.2545,
+      "step": 240
+    },
+    {
+      "epoch": 0.3382949932341001,
+      "grad_norm": 0.622627317905426,
+      "learning_rate": 0.0002,
+      "loss": 1.2408,
+      "step": 250
+    },
+    {
+      "epoch": 0.35182679296346414,
+      "grad_norm": 0.5773138999938965,
+      "learning_rate": 0.0002,
+      "loss": 1.3083,
+      "step": 260
+    },
+    {
+      "epoch": 0.36535859269282817,
+      "grad_norm": 1.104275107383728,
+      "learning_rate": 0.0002,
+      "loss": 1.0475,
+      "step": 270
+    },
+    {
+      "epoch": 0.37889039242219213,
+      "grad_norm": 0.5755344033241272,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 280
+    },
+    {
+      "epoch": 0.39242219215155616,
+      "grad_norm": 0.5885311961174011,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 290
+    },
+    {
+      "epoch": 0.4059539918809202,
+      "grad_norm": 1.448182225227356,
+      "learning_rate": 0.0002,
+      "loss": 1.2099,
+      "step": 300
+    },
+    {
+      "epoch": 0.41948579161028415,
+      "grad_norm": 0.5983599424362183,
+      "learning_rate": 0.0002,
+      "loss": 1.2114,
+      "step": 310
+    },
+    {
+      "epoch": 0.4330175913396482,
+      "grad_norm": 0.5013539791107178,
+      "learning_rate": 0.0002,
+      "loss": 1.1691,
+      "step": 320
+    },
+    {
+      "epoch": 0.4465493910690122,
+      "grad_norm": 0.8935738205909729,
+      "learning_rate": 0.0002,
+      "loss": 1.138,
+      "step": 330
+    },
+    {
+      "epoch": 0.46008119079837617,
+      "grad_norm": 0.5642115473747253,
+      "learning_rate": 0.0002,
+      "loss": 1.1336,
+      "step": 340
+    },
+    {
+      "epoch": 0.4736129905277402,
+      "grad_norm": 0.7897255420684814,
+      "learning_rate": 0.0002,
+      "loss": 1.2355,
+      "step": 350
+    },
+    {
+      "epoch": 0.4871447902571042,
+      "grad_norm": 1.6891459226608276,
+      "learning_rate": 0.0002,
+      "loss": 1.1067,
+      "step": 360
+    },
+    {
+      "epoch": 0.5006765899864682,
+      "grad_norm": 1.1374807357788086,
+      "learning_rate": 0.0002,
+      "loss": 1.1787,
+      "step": 370
+    },
+    {
+      "epoch": 0.5142083897158322,
+      "grad_norm": 0.5355549454689026,
+      "learning_rate": 0.0002,
+      "loss": 1.1525,
+      "step": 380
+    },
+    {
+      "epoch": 0.5277401894451962,
+      "grad_norm": 0.656196653842926,
+      "learning_rate": 0.0002,
+      "loss": 1.1298,
+      "step": 390
+    },
+    {
+      "epoch": 0.5412719891745602,
+      "grad_norm": 0.8692356944084167,
+      "learning_rate": 0.0002,
+      "loss": 1.1471,
+      "step": 400
+    },
+    {
+      "epoch": 0.5548037889039242,
+      "grad_norm": 0.5873697400093079,
+      "learning_rate": 0.0002,
+      "loss": 0.9781,
+      "step": 410
+    },
+    {
+      "epoch": 0.5683355886332883,
+      "grad_norm": 0.8922758102416992,
+      "learning_rate": 0.0002,
+      "loss": 1.1841,
+      "step": 420
+    },
+    {
+      "epoch": 0.5818673883626523,
+      "grad_norm": 0.5048012733459473,
+      "learning_rate": 0.0002,
+      "loss": 1.184,
+      "step": 430
+    },
+    {
+      "epoch": 0.5953991880920162,
+      "grad_norm": 0.603631317615509,
+      "learning_rate": 0.0002,
+      "loss": 1.3276,
+      "step": 440
+    },
+    {
+      "epoch": 0.6089309878213802,
+      "grad_norm": 0.5635734796524048,
+      "learning_rate": 0.0002,
+      "loss": 1.0459,
+      "step": 450
+    },
+    {
+      "epoch": 0.6224627875507442,
+      "grad_norm": 0.7483186721801758,
+      "learning_rate": 0.0002,
+      "loss": 1.1549,
+      "step": 460
+    },
+    {
+      "epoch": 0.6359945872801083,
+      "grad_norm": 0.5372456312179565,
+      "learning_rate": 0.0002,
+      "loss": 1.1141,
+      "step": 470
+    },
+    {
+      "epoch": 0.6495263870094723,
+      "grad_norm": 0.9461246728897095,
+      "learning_rate": 0.0002,
+      "loss": 1.2966,
+      "step": 480
+    },
+    {
+      "epoch": 0.6630581867388363,
+      "grad_norm": 0.7017379403114319,
+      "learning_rate": 0.0002,
+      "loss": 1.1226,
+      "step": 490
+    },
+    {
+      "epoch": 0.6765899864682002,
+      "grad_norm": 1.6177887916564941,
+      "learning_rate": 0.0002,
+      "loss": 1.2138,
+      "step": 500
+    },
+    {
+      "epoch": 0.6901217861975643,
+      "grad_norm": 0.4857328236103058,
+      "learning_rate": 0.0002,
+      "loss": 1.0716,
+      "step": 510
+    },
+    {
+      "epoch": 0.7036535859269283,
+      "grad_norm": 1.0545706748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.2898,
+      "step": 520
+    },
+    {
+      "epoch": 0.7171853856562923,
+      "grad_norm": 0.7486541867256165,
+      "learning_rate": 0.0002,
+      "loss": 1.2527,
+      "step": 530
+    },
+    {
+      "epoch": 0.7307171853856563,
+      "grad_norm": 0.6588427424430847,
+      "learning_rate": 0.0002,
+      "loss": 1.0634,
+      "step": 540
+    },
+    {
+      "epoch": 0.7442489851150202,
+      "grad_norm": 0.9485914707183838,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 550
+    },
+    {
+      "epoch": 0.7577807848443843,
+      "grad_norm": 0.7465947866439819,
+      "learning_rate": 0.0002,
+      "loss": 1.1933,
+      "step": 560
+    },
+    {
+      "epoch": 0.7713125845737483,
+      "grad_norm": 0.6392837166786194,
+      "learning_rate": 0.0002,
+      "loss": 1.0927,
+      "step": 570
+    },
+    {
+      "epoch": 0.7848443843031123,
+      "grad_norm": 0.4898282587528229,
+      "learning_rate": 0.0002,
+      "loss": 1.14,
+      "step": 580
+    },
+    {
+      "epoch": 0.7983761840324763,
+      "grad_norm": 0.5636171102523804,
+      "learning_rate": 0.0002,
+      "loss": 1.0425,
+      "step": 590
+    },
+    {
+      "epoch": 0.8119079837618404,
+      "grad_norm": 0.6637675166130066,
+      "learning_rate": 0.0002,
+      "loss": 1.0717,
+      "step": 600
+    },
+    {
+      "epoch": 0.8254397834912043,
+      "grad_norm": 1.1842738389968872,
+      "learning_rate": 0.0002,
+      "loss": 1.1204,
+      "step": 610
+    },
+    {
+      "epoch": 0.8389715832205683,
+      "grad_norm": 0.5699004530906677,
+      "learning_rate": 0.0002,
+      "loss": 1.083,
+      "step": 620
+    },
+    {
+      "epoch": 0.8525033829499323,
+      "grad_norm": 0.7748669385910034,
+      "learning_rate": 0.0002,
+      "loss": 1.153,
+      "step": 630
+    },
+    {
+      "epoch": 0.8660351826792964,
+      "grad_norm": 0.7987180352210999,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 640
+    },
+    {
+      "epoch": 0.8795669824086604,
+      "grad_norm": 1.0740629434585571,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 650
+    },
+    {
+      "epoch": 0.8930987821380244,
+      "grad_norm": 0.731082022190094,
+      "learning_rate": 0.0002,
+      "loss": 1.0503,
+      "step": 660
+    },
+    {
+      "epoch": 0.9066305818673883,
+      "grad_norm": 0.9066846966743469,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 670
+    },
+    {
+      "epoch": 0.9201623815967523,
+      "grad_norm": 0.9934597015380859,
+      "learning_rate": 0.0002,
+      "loss": 1.0516,
+      "step": 680
+    },
+    {
+      "epoch": 0.9336941813261164,
+      "grad_norm": 0.7975896000862122,
+      "learning_rate": 0.0002,
+      "loss": 1.0983,
+      "step": 690
+    },
+    {
+      "epoch": 0.9472259810554804,
+      "grad_norm": 0.9127744436264038,
+      "learning_rate": 0.0002,
+      "loss": 1.2362,
+      "step": 700
+    },
+    {
+      "epoch": 0.9607577807848444,
+      "grad_norm": 0.7682064175605774,
+      "learning_rate": 0.0002,
+      "loss": 1.0273,
+      "step": 710
+    },
+    {
+      "epoch": 0.9742895805142084,
+      "grad_norm": 0.9808696508407593,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 720
+    },
+    {
+      "epoch": 0.9878213802435724,
+      "grad_norm": 1.0826992988586426,
+      "learning_rate": 0.0002,
+      "loss": 1.0979,
+      "step": 730
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2867646217346191,
+      "eval_runtime": 53.6569,
+      "eval_samples_per_second": 8.498,
+      "eval_steps_per_second": 1.062,
+      "step": 739
+    },
+    {
+      "epoch": 1.0013531799729365,
+      "grad_norm": 0.6498591303825378,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 740
+    },
+    {
+      "epoch": 1.0148849797023005,
+      "grad_norm": 0.8040738701820374,
+      "learning_rate": 0.0002,
+      "loss": 1.0737,
+      "step": 750
+    },
+    {
+      "epoch": 1.0284167794316643,
+      "grad_norm": 0.7280412912368774,
+      "learning_rate": 0.0002,
+      "loss": 1.03,
+      "step": 760
+    },
+    {
+      "epoch": 1.0419485791610283,
+      "grad_norm": 0.672149121761322,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 770
+    },
+    {
+      "epoch": 1.0554803788903924,
+      "grad_norm": 1.1186308860778809,
+      "learning_rate": 0.0002,
+      "loss": 0.997,
+      "step": 780
+    },
+    {
+      "epoch": 1.0690121786197564,
+      "grad_norm": 0.9073583483695984,
+      "learning_rate": 0.0002,
+      "loss": 0.9088,
+      "step": 790
+    },
+    {
+      "epoch": 1.0825439783491204,
+      "grad_norm": 0.6135605573654175,
+      "learning_rate": 0.0002,
+      "loss": 0.9413,
+      "step": 800
+    },
+    {
+      "epoch": 1.0960757780784844,
+      "grad_norm": 0.5854787826538086,
+      "learning_rate": 0.0002,
+      "loss": 0.9024,
+      "step": 810
+    },
+    {
+      "epoch": 1.1096075778078485,
+      "grad_norm": 0.9077727794647217,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 820
+    },
+    {
+      "epoch": 1.1231393775372125,
+      "grad_norm": 0.7072564363479614,
+      "learning_rate": 0.0002,
+      "loss": 0.9489,
+      "step": 830
+    },
+    {
+      "epoch": 1.1366711772665765,
+      "grad_norm": 0.9457924365997314,
+      "learning_rate": 0.0002,
+      "loss": 0.9275,
+      "step": 840
+    },
+    {
+      "epoch": 1.1502029769959405,
+      "grad_norm": 0.9216122031211853,
+      "learning_rate": 0.0002,
+      "loss": 0.9998,
+      "step": 850
+    },
+    {
+      "epoch": 1.1637347767253043,
+      "grad_norm": 1.0899791717529297,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 860
+    },
+    {
+      "epoch": 1.1772665764546684,
+      "grad_norm": 0.8594662547111511,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 870
+    },
+    {
+      "epoch": 1.1907983761840324,
+      "grad_norm": 0.8680914640426636,
+      "learning_rate": 0.0002,
+      "loss": 0.9513,
+      "step": 880
+    },
+    {
+      "epoch": 1.2043301759133964,
+      "grad_norm": 0.5579341650009155,
+      "learning_rate": 0.0002,
+      "loss": 0.9695,
+      "step": 890
+    },
+    {
+      "epoch": 1.2178619756427604,
+      "grad_norm": 0.8556986451148987,
+      "learning_rate": 0.0002,
+      "loss": 1.0153,
+      "step": 900
+    },
+    {
+      "epoch": 1.2313937753721245,
+      "grad_norm": 1.8943263292312622,
+      "learning_rate": 0.0002,
+      "loss": 0.9589,
+      "step": 910
+    },
+    {
+      "epoch": 1.2449255751014885,
+      "grad_norm": 0.7652221918106079,
+      "learning_rate": 0.0002,
+      "loss": 0.9554,
+      "step": 920
+    },
+    {
+      "epoch": 1.2584573748308525,
+      "grad_norm": 0.6921482086181641,
+      "learning_rate": 0.0002,
+      "loss": 0.955,
+      "step": 930
+    },
+    {
+      "epoch": 1.2719891745602165,
+      "grad_norm": 0.7211646437644958,
+      "learning_rate": 0.0002,
+      "loss": 1.0335,
+      "step": 940
+    },
+    {
+      "epoch": 1.2855209742895806,
+      "grad_norm": 0.9096421599388123,
+      "learning_rate": 0.0002,
+      "loss": 1.0597,
+      "step": 950
+    },
+    {
+      "epoch": 1.2990527740189446,
+      "grad_norm": 0.743715226650238,
+      "learning_rate": 0.0002,
+      "loss": 1.1143,
+      "step": 960
+    },
+    {
+      "epoch": 1.3125845737483086,
+      "grad_norm": 0.9247064590454102,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 970
+    },
+    {
+      "epoch": 1.3261163734776726,
+      "grad_norm": 1.0811798572540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9534,
+      "step": 980
+    },
+    {
+      "epoch": 1.3396481732070367,
+      "grad_norm": 0.7317015528678894,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 990
+    },
+    {
+      "epoch": 1.3531799729364005,
+      "grad_norm": 0.8399309515953064,
+      "learning_rate": 0.0002,
+      "loss": 1.0071,
+      "step": 1000
+    },
+    {
+      "epoch": 1.3667117726657645,
+      "grad_norm": 1.094558835029602,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 1010
+    },
+    {
+      "epoch": 1.3802435723951285,
+      "grad_norm": 1.3759856224060059,
+      "learning_rate": 0.0002,
+      "loss": 0.8744,
+      "step": 1020
+    },
+    {
+      "epoch": 1.3937753721244925,
+      "grad_norm": 0.8855497241020203,
+      "learning_rate": 0.0002,
+      "loss": 0.915,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4073071718538566,
+      "grad_norm": 3.6836671829223633,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1040
+    },
+    {
+      "epoch": 1.4208389715832206,
+      "grad_norm": 1.1119214296340942,
+      "learning_rate": 0.0002,
+      "loss": 0.8975,
+      "step": 1050
+    },
+    {
+      "epoch": 1.4343707713125846,
+      "grad_norm": 0.8871118426322937,
+      "learning_rate": 0.0002,
+      "loss": 0.9381,
+      "step": 1060
+    },
+    {
+      "epoch": 1.4479025710419486,
+      "grad_norm": 0.9937213063240051,
+      "learning_rate": 0.0002,
+      "loss": 0.9091,
+      "step": 1070
+    },
+    {
+      "epoch": 1.4614343707713127,
+      "grad_norm": 0.7206485867500305,
+      "learning_rate": 0.0002,
+      "loss": 0.9923,
+      "step": 1080
+    },
+    {
+      "epoch": 1.4749661705006765,
+      "grad_norm": 0.8442404866218567,
+      "learning_rate": 0.0002,
+      "loss": 0.951,
+      "step": 1090
+    },
+    {
+      "epoch": 1.4884979702300405,
+      "grad_norm": 0.9265049695968628,
+      "learning_rate": 0.0002,
+      "loss": 0.8609,
+      "step": 1100
+    },
+    {
+      "epoch": 1.5020297699594045,
+      "grad_norm": 1.1033650636672974,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 1110
+    },
+    {
+      "epoch": 1.5155615696887685,
+      "grad_norm": 0.7876176834106445,
+      "learning_rate": 0.0002,
+      "loss": 1.004,
+      "step": 1120
+    },
+    {
+      "epoch": 1.5290933694181326,
+      "grad_norm": 0.7761271595954895,
+      "learning_rate": 0.0002,
+      "loss": 0.9555,
+      "step": 1130
+    },
+    {
+      "epoch": 1.5426251691474966,
+      "grad_norm": 1.0603803396224976,
+      "learning_rate": 0.0002,
+      "loss": 0.9569,
+      "step": 1140
+    },
+    {
+      "epoch": 1.5561569688768606,
+      "grad_norm": 0.7715556621551514,
+      "learning_rate": 0.0002,
+      "loss": 0.9842,
+      "step": 1150
+    },
+    {
+      "epoch": 1.5696887686062246,
+      "grad_norm": 0.6591511368751526,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 1160
+    },
+    {
+      "epoch": 1.5832205683355887,
+      "grad_norm": 1.1773475408554077,
+      "learning_rate": 0.0002,
+      "loss": 0.9584,
+      "step": 1170
+    },
+    {
+      "epoch": 1.5967523680649527,
+      "grad_norm": 0.8513862490653992,
+      "learning_rate": 0.0002,
+      "loss": 0.9229,
+      "step": 1180
+    },
+    {
+      "epoch": 1.6102841677943167,
+      "grad_norm": 1.0796581506729126,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1190
+    },
+    {
+      "epoch": 1.6238159675236807,
+      "grad_norm": 0.8897230625152588,
+      "learning_rate": 0.0002,
+      "loss": 0.9698,
+      "step": 1200
+    },
+    {
+      "epoch": 1.6373477672530448,
+      "grad_norm": 1.4640971422195435,
+      "learning_rate": 0.0002,
+      "loss": 0.9295,
+      "step": 1210
+    },
+    {
+      "epoch": 1.6508795669824088,
+      "grad_norm": 1.123056173324585,
+      "learning_rate": 0.0002,
+      "loss": 1.003,
+      "step": 1220
+    },
+    {
+      "epoch": 1.6644113667117728,
+      "grad_norm": 1.1064175367355347,
+      "learning_rate": 0.0002,
+      "loss": 0.9524,
+      "step": 1230
+    },
+    {
+      "epoch": 1.6779431664411368,
+      "grad_norm": 2.4434642791748047,
+      "learning_rate": 0.0002,
+      "loss": 0.8896,
+      "step": 1240
+    },
+    {
+      "epoch": 1.6914749661705006,
+      "grad_norm": 1.0455760955810547,
+      "learning_rate": 0.0002,
+      "loss": 0.9899,
+      "step": 1250
+    },
+    {
+      "epoch": 1.7050067658998647,
+      "grad_norm": 1.1007593870162964,
+      "learning_rate": 0.0002,
+      "loss": 0.9032,
+      "step": 1260
+    },
+    {
+      "epoch": 1.7185385656292287,
+      "grad_norm": 1.2697606086730957,
+      "learning_rate": 0.0002,
+      "loss": 0.9226,
+      "step": 1270
+    },
+    {
+      "epoch": 1.7320703653585927,
+      "grad_norm": 1.1537855863571167,
+      "learning_rate": 0.0002,
+      "loss": 0.8771,
+      "step": 1280
+    },
+    {
+      "epoch": 1.7456021650879567,
+      "grad_norm": 0.9637187719345093,
+      "learning_rate": 0.0002,
+      "loss": 0.8655,
+      "step": 1290
+    },
+    {
+      "epoch": 1.7591339648173205,
+      "grad_norm": 1.1610347032546997,
+      "learning_rate": 0.0002,
+      "loss": 0.9641,
+      "step": 1300
+    },
+    {
+      "epoch": 1.7726657645466846,
+      "grad_norm": 0.717607319355011,
+      "learning_rate": 0.0002,
+      "loss": 0.9417,
+      "step": 1310
+    },
+    {
+      "epoch": 1.7861975642760486,
+      "grad_norm": 1.753371238708496,
+      "learning_rate": 0.0002,
+      "loss": 0.8852,
+      "step": 1320
+    },
+    {
+      "epoch": 1.7997293640054126,
+      "grad_norm": 0.7919637560844421,
+      "learning_rate": 0.0002,
+      "loss": 1.0327,
+      "step": 1330
+    },
+    {
+      "epoch": 1.8132611637347766,
+      "grad_norm": 1.1091023683547974,
+      "learning_rate": 0.0002,
+      "loss": 1.0019,
+      "step": 1340
+    },
+    {
+      "epoch": 1.8267929634641407,
+      "grad_norm": 0.7157362699508667,
+      "learning_rate": 0.0002,
+      "loss": 0.9457,
+      "step": 1350
+    },
+    {
+      "epoch": 1.8403247631935047,
+      "grad_norm": 0.9538856744766235,
+      "learning_rate": 0.0002,
+      "loss": 0.9818,
+      "step": 1360
+    },
+    {
+      "epoch": 1.8538565629228687,
+      "grad_norm": 1.689642071723938,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 1370
+    },
+    {
+      "epoch": 1.8673883626522327,
+      "grad_norm": 1.3405762910842896,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1380
+    },
+    {
+      "epoch": 1.8809201623815968,
+      "grad_norm": 1.187905192375183,
+      "learning_rate": 0.0002,
+      "loss": 0.9279,
+      "step": 1390
+    },
+    {
+      "epoch": 1.8944519621109608,
+      "grad_norm": 1.403511643409729,
+      "learning_rate": 0.0002,
+      "loss": 0.9266,
+      "step": 1400
+    },
+    {
+      "epoch": 1.9079837618403248,
+      "grad_norm": 1.4245457649230957,
+      "learning_rate": 0.0002,
+      "loss": 0.9654,
+      "step": 1410
+    },
+    {
+      "epoch": 1.9215155615696888,
+      "grad_norm": 0.6742255687713623,
+      "learning_rate": 0.0002,
+      "loss": 0.9047,
+      "step": 1420
+    },
+    {
+      "epoch": 1.9350473612990529,
+      "grad_norm": 0.9301473498344421,
+      "learning_rate": 0.0002,
+      "loss": 0.9837,
+      "step": 1430
+    },
+    {
+      "epoch": 1.9485791610284169,
+      "grad_norm": 0.8039385080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 1440
+    },
+    {
+      "epoch": 1.962110960757781,
+      "grad_norm": 0.7449126839637756,
+      "learning_rate": 0.0002,
+      "loss": 0.9433,
+      "step": 1450
+    },
+    {
+      "epoch": 1.975642760487145,
+      "grad_norm": 1.8016695976257324,
+      "learning_rate": 0.0002,
+      "loss": 0.9424,
+      "step": 1460
+    },
+    {
+      "epoch": 1.989174560216509,
+      "grad_norm": 1.3347259759902954,
+      "learning_rate": 0.0002,
+      "loss": 0.9434,
+      "step": 1470
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.28184974193573,
+      "eval_runtime": 54.9872,
+      "eval_samples_per_second": 8.293,
+      "eval_steps_per_second": 1.037,
+      "step": 1478
+    },
+    {
+      "epoch": 2.002706359945873,
+      "grad_norm": 1.1238535642623901,
+      "learning_rate": 0.0002,
+      "loss": 0.8667,
+      "step": 1480
+    },
+    {
+      "epoch": 2.016238159675237,
+      "grad_norm": 0.9186404943466187,
+      "learning_rate": 0.0002,
+      "loss": 0.741,
+      "step": 1490
+    },
+    {
+      "epoch": 2.029769959404601,
+      "grad_norm": 1.42500901222229,
+      "learning_rate": 0.0002,
+      "loss": 0.7402,
+      "step": 1500
+    },
+    {
+      "epoch": 2.0433017591339646,
+      "grad_norm": 0.9018896222114563,
+      "learning_rate": 0.0002,
+      "loss": 0.6594,
+      "step": 1510
+    },
+    {
+      "epoch": 2.0568335588633286,
+      "grad_norm": 0.9482589364051819,
+      "learning_rate": 0.0002,
+      "loss": 0.7973,
+      "step": 1520
+    },
+    {
+      "epoch": 2.0703653585926927,
+      "grad_norm": 1.7364821434020996,
+      "learning_rate": 0.0002,
+      "loss": 0.7494,
+      "step": 1530
+    },
+    {
+      "epoch": 2.0838971583220567,
+      "grad_norm": 1.1600096225738525,
+      "learning_rate": 0.0002,
+      "loss": 0.6691,
+      "step": 1540
+    },
+    {
+      "epoch": 2.0974289580514207,
+      "grad_norm": 1.1180989742279053,
+      "learning_rate": 0.0002,
+      "loss": 0.7443,
+      "step": 1550
+    },
+    {
+      "epoch": 2.1109607577807847,
+      "grad_norm": 0.7978046536445618,
+      "learning_rate": 0.0002,
+      "loss": 0.7261,
+      "step": 1560
+    },
+    {
+      "epoch": 2.1244925575101488,
+      "grad_norm": 0.9089515805244446,
+      "learning_rate": 0.0002,
+      "loss": 0.7659,
+      "step": 1570
+    },
+    {
+      "epoch": 2.138024357239513,
+      "grad_norm": 1.2802879810333252,
+      "learning_rate": 0.0002,
+      "loss": 0.7751,
+      "step": 1580
+    },
+    {
+      "epoch": 2.151556156968877,
+      "grad_norm": 1.1321839094161987,
+      "learning_rate": 0.0002,
+      "loss": 0.7744,
+      "step": 1590
+    },
+    {
+      "epoch": 2.165087956698241,
+      "grad_norm": 0.9985150098800659,
+      "learning_rate": 0.0002,
+      "loss": 0.699,
+      "step": 1600
+    },
+    {
+      "epoch": 2.178619756427605,
+      "grad_norm": 1.1864978075027466,
+      "learning_rate": 0.0002,
+      "loss": 0.7497,
+      "step": 1610
+    },
+    {
+      "epoch": 2.192151556156969,
+      "grad_norm": 1.0220770835876465,
+      "learning_rate": 0.0002,
+      "loss": 0.7648,
+      "step": 1620
+    },
+    {
+      "epoch": 2.205683355886333,
+      "grad_norm": 1.075281023979187,
+      "learning_rate": 0.0002,
+      "loss": 0.7786,
+      "step": 1630
+    },
+    {
+      "epoch": 2.219215155615697,
+      "grad_norm": 1.7539390325546265,
+      "learning_rate": 0.0002,
+      "loss": 0.7169,
+      "step": 1640
+    },
+    {
+      "epoch": 2.232746955345061,
+      "grad_norm": 0.9327954053878784,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 1650
+    },
+    {
+      "epoch": 2.246278755074425,
+      "grad_norm": 1.1239676475524902,
+      "learning_rate": 0.0002,
+      "loss": 0.7096,
+      "step": 1660
+    },
+    {
+      "epoch": 2.259810554803789,
+      "grad_norm": 0.887867271900177,
+      "learning_rate": 0.0002,
+      "loss": 0.7516,
+      "step": 1670
+    },
+    {
+      "epoch": 2.273342354533153,
+      "grad_norm": 0.9934070110321045,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 1680
+    },
+    {
+      "epoch": 2.286874154262517,
+      "grad_norm": 1.1046375036239624,
+      "learning_rate": 0.0002,
+      "loss": 0.7891,
+      "step": 1690
+    },
+    {
+      "epoch": 2.300405953991881,
+      "grad_norm": 1.3520793914794922,
+      "learning_rate": 0.0002,
+      "loss": 0.7123,
+      "step": 1700
+    },
+    {
+      "epoch": 2.313937753721245,
+      "grad_norm": 1.0396424531936646,
+      "learning_rate": 0.0002,
+      "loss": 0.722,
+      "step": 1710
+    },
+    {
+      "epoch": 2.3274695534506087,
+      "grad_norm": 1.312713861465454,
+      "learning_rate": 0.0002,
+      "loss": 0.7645,
+      "step": 1720
+    },
+    {
+      "epoch": 2.3410013531799727,
+      "grad_norm": 1.2425963878631592,
+      "learning_rate": 0.0002,
+      "loss": 0.7743,
+      "step": 1730
+    },
+    {
+      "epoch": 2.3545331529093367,
+      "grad_norm": 1.0335496664047241,
+      "learning_rate": 0.0002,
+      "loss": 0.7069,
+      "step": 1740
+    },
+    {
+      "epoch": 2.3680649526387008,
+      "grad_norm": 0.8289833664894104,
+      "learning_rate": 0.0002,
+      "loss": 0.763,
+      "step": 1750
+    },
+    {
+      "epoch": 2.381596752368065,
+      "grad_norm": 1.1725471019744873,
+      "learning_rate": 0.0002,
+      "loss": 0.749,
+      "step": 1760
+    },
+    {
+      "epoch": 2.395128552097429,
+      "grad_norm": 1.10824716091156,
+      "learning_rate": 0.0002,
+      "loss": 0.7842,
+      "step": 1770
+    },
+    {
+      "epoch": 2.408660351826793,
+      "grad_norm": 1.027957797050476,
+      "learning_rate": 0.0002,
+      "loss": 0.8115,
+      "step": 1780
+    },
+    {
+      "epoch": 2.422192151556157,
+      "grad_norm": 1.4744906425476074,
+      "learning_rate": 0.0002,
+      "loss": 0.7529,
+      "step": 1790
+    },
+    {
+      "epoch": 2.435723951285521,
+      "grad_norm": 2.044746160507202,
+      "learning_rate": 0.0002,
+      "loss": 0.7146,
+      "step": 1800
+    },
+    {
+      "epoch": 2.449255751014885,
+      "grad_norm": 0.9940636157989502,
+      "learning_rate": 0.0002,
+      "loss": 0.6738,
+      "step": 1810
+    },
+    {
+      "epoch": 2.462787550744249,
+      "grad_norm": 1.2338303327560425,
+      "learning_rate": 0.0002,
+      "loss": 0.8233,
+      "step": 1820
+    },
+    {
+      "epoch": 2.476319350473613,
+      "grad_norm": 1.1820061206817627,
+      "learning_rate": 0.0002,
+      "loss": 0.6409,
+      "step": 1830
+    },
+    {
+      "epoch": 2.489851150202977,
+      "grad_norm": 1.5557365417480469,
+      "learning_rate": 0.0002,
+      "loss": 0.7744,
+      "step": 1840
+    },
+    {
+      "epoch": 2.503382949932341,
+      "grad_norm": 0.927599310874939,
+      "learning_rate": 0.0002,
+      "loss": 0.742,
+      "step": 1850
+    },
+    {
+      "epoch": 2.516914749661705,
+      "grad_norm": 1.384813666343689,
+      "learning_rate": 0.0002,
+      "loss": 0.7683,
+      "step": 1860
+    },
+    {
+      "epoch": 2.530446549391069,
+      "grad_norm": 1.0022202730178833,
+      "learning_rate": 0.0002,
+      "loss": 0.7841,
+      "step": 1870
+    },
+    {
+      "epoch": 2.543978349120433,
+      "grad_norm": 1.0271503925323486,
+      "learning_rate": 0.0002,
+      "loss": 0.7422,
+      "step": 1880
+    },
+    {
+      "epoch": 2.557510148849797,
+      "grad_norm": 1.2724764347076416,
+      "learning_rate": 0.0002,
+      "loss": 0.7513,
+      "step": 1890
+    },
+    {
+      "epoch": 2.571041948579161,
+      "grad_norm": 0.9063859581947327,
+      "learning_rate": 0.0002,
+      "loss": 0.7594,
+      "step": 1900
+    },
+    {
+      "epoch": 2.584573748308525,
+      "grad_norm": 0.9433910250663757,
+      "learning_rate": 0.0002,
+      "loss": 0.7423,
+      "step": 1910
+    },
+    {
+      "epoch": 2.598105548037889,
+      "grad_norm": 0.8303482532501221,
+      "learning_rate": 0.0002,
+      "loss": 0.6833,
+      "step": 1920
+    },
+    {
+      "epoch": 2.611637347767253,
+      "grad_norm": 1.16862952709198,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 1930
+    },
+    {
+      "epoch": 2.6251691474966172,
+      "grad_norm": 0.8904703855514526,
+      "learning_rate": 0.0002,
+      "loss": 0.7544,
+      "step": 1940
+    },
+    {
+      "epoch": 2.6387009472259813,
+      "grad_norm": 1.2958505153656006,
+      "learning_rate": 0.0002,
+      "loss": 0.7339,
+      "step": 1950
+    },
+    {
+      "epoch": 2.6522327469553453,
+      "grad_norm": 1.2310389280319214,
+      "learning_rate": 0.0002,
+      "loss": 0.7502,
+      "step": 1960
+    },
+    {
+      "epoch": 2.6657645466847093,
+      "grad_norm": 1.3157947063446045,
+      "learning_rate": 0.0002,
+      "loss": 0.8305,
+      "step": 1970
+    },
+    {
+      "epoch": 2.6792963464140733,
+      "grad_norm": 0.9247841238975525,
+      "learning_rate": 0.0002,
+      "loss": 0.7348,
+      "step": 1980
+    },
+    {
+      "epoch": 2.6928281461434374,
+      "grad_norm": 0.9850119352340698,
+      "learning_rate": 0.0002,
+      "loss": 0.7352,
+      "step": 1990
+    },
+    {
+      "epoch": 2.706359945872801,
+      "grad_norm": 1.59624183177948,
+      "learning_rate": 0.0002,
+      "loss": 0.7794,
+      "step": 2000
+    },
+    {
+      "epoch": 2.719891745602165,
+      "grad_norm": 1.791932225227356,
+      "learning_rate": 0.0002,
+      "loss": 0.6918,
+      "step": 2010
+    },
+    {
+      "epoch": 2.733423545331529,
+      "grad_norm": 0.8530828356742859,
+      "learning_rate": 0.0002,
+      "loss": 0.7616,
+      "step": 2020
+    },
+    {
+      "epoch": 2.746955345060893,
+      "grad_norm": 1.431843638420105,
+      "learning_rate": 0.0002,
+      "loss": 0.8028,
+      "step": 2030
+    },
+    {
+      "epoch": 2.760487144790257,
+      "grad_norm": 1.1818324327468872,
+      "learning_rate": 0.0002,
+      "loss": 0.7403,
+      "step": 2040
+    },
+    {
+      "epoch": 2.774018944519621,
+      "grad_norm": 1.0456408262252808,
+      "learning_rate": 0.0002,
+      "loss": 0.6752,
+      "step": 2050
+    },
+    {
+      "epoch": 2.787550744248985,
+      "grad_norm": 1.5935403108596802,
+      "learning_rate": 0.0002,
+      "loss": 0.7771,
+      "step": 2060
+    },
+    {
+      "epoch": 2.801082543978349,
+      "grad_norm": 1.6653326749801636,
+      "learning_rate": 0.0002,
+      "loss": 0.722,
+      "step": 2070
+    },
+    {
+      "epoch": 2.814614343707713,
+      "grad_norm": 1.2409698963165283,
+      "learning_rate": 0.0002,
+      "loss": 0.7553,
+      "step": 2080
+    },
+    {
+      "epoch": 2.828146143437077,
+      "grad_norm": 0.8511452674865723,
+      "learning_rate": 0.0002,
+      "loss": 0.7483,
+      "step": 2090
+    },
+    {
+      "epoch": 2.841677943166441,
+      "grad_norm": 1.1064083576202393,
+      "learning_rate": 0.0002,
+      "loss": 0.7773,
+      "step": 2100
+    },
+    {
+      "epoch": 2.855209742895805,
+      "grad_norm": 1.5252450704574585,
+      "learning_rate": 0.0002,
+      "loss": 0.8025,
+      "step": 2110
+    },
+    {
+      "epoch": 2.8687415426251692,
+      "grad_norm": 1.8477630615234375,
+      "learning_rate": 0.0002,
+      "loss": 0.7342,
+      "step": 2120
+    },
+    {
+      "epoch": 2.8822733423545333,
+      "grad_norm": 1.8251630067825317,
+      "learning_rate": 0.0002,
+      "loss": 1.0005,
+      "step": 2130
+    },
+    {
+      "epoch": 2.8958051420838973,
+      "grad_norm": 2.0696771144866943,
+      "learning_rate": 0.0002,
+      "loss": 0.8036,
+      "step": 2140
+    },
+    {
+      "epoch": 2.9093369418132613,
+      "grad_norm": 1.0875508785247803,
+      "learning_rate": 0.0002,
+      "loss": 0.7561,
+      "step": 2150
+    },
+    {
+      "epoch": 2.9228687415426253,
+      "grad_norm": 1.1486080884933472,
+      "learning_rate": 0.0002,
+      "loss": 0.808,
+      "step": 2160
+    },
+    {
+      "epoch": 2.936400541271989,
+      "grad_norm": 1.3913694620132446,
+      "learning_rate": 0.0002,
+      "loss": 0.7418,
+      "step": 2170
+    },
+    {
+      "epoch": 2.949932341001353,
+      "grad_norm": 1.0237643718719482,
+      "learning_rate": 0.0002,
+      "loss": 0.7829,
+      "step": 2180
+    },
+    {
+      "epoch": 2.963464140730717,
+      "grad_norm": 1.0060926675796509,
+      "learning_rate": 0.0002,
+      "loss": 0.8717,
+      "step": 2190
+    },
+    {
+      "epoch": 2.976995940460081,
+      "grad_norm": 0.8395462036132812,
+      "learning_rate": 0.0002,
+      "loss": 0.7581,
+      "step": 2200
+    },
+    {
+      "epoch": 2.990527740189445,
+      "grad_norm": 1.4483158588409424,
+      "learning_rate": 0.0002,
+      "loss": 0.7032,
+      "step": 2210
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.3191189765930176,
+      "eval_runtime": 53.5323,
+      "eval_samples_per_second": 8.518,
+      "eval_steps_per_second": 1.065,
+      "step": 2217
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5912,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.72702691098624e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c0e60ee7ec9d35429ff5330b6a72628063da6a9c843e01fa8c503c2ba1303f
+size 5624

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72b58f2791fa301857eaa8a2b70187927d155805814dd8be0af3e319768648f3
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21bbc23498ce7573081eccfcee8cf267af89dc4d66e25ed2ea3c5ffefa285f68
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:079fe5b9e8e96e9ef6bf1832a86befe5276c9afc597368c3f79a7b0bcff32ed0
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02045bcc359785ae61a174e4f10d5475b7b1ad0149f88409bcaa1bb9394ff240
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2130 @@

+{
+  "best_metric": 1.28184974193573,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 2956,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013531799729364006,
+      "grad_norm": 1.1050695180892944,
+      "learning_rate": 0.0002,
+      "loss": 1.9104,
+      "step": 10
+    },
+    {
+      "epoch": 0.02706359945872801,
+      "grad_norm": 1.2463239431381226,
+      "learning_rate": 0.0002,
+      "loss": 1.6832,
+      "step": 20
+    },
+    {
+      "epoch": 0.04059539918809202,
+      "grad_norm": 1.193475604057312,
+      "learning_rate": 0.0002,
+      "loss": 1.4273,
+      "step": 30
+    },
+    {
+      "epoch": 0.05412719891745602,
+      "grad_norm": 0.7777265310287476,
+      "learning_rate": 0.0002,
+      "loss": 1.4369,
+      "step": 40
+    },
+    {
+      "epoch": 0.06765899864682003,
+      "grad_norm": 1.4079619646072388,
+      "learning_rate": 0.0002,
+      "loss": 1.3618,
+      "step": 50
+    },
+    {
+      "epoch": 0.08119079837618404,
+      "grad_norm": 0.9451370239257812,
+      "learning_rate": 0.0002,
+      "loss": 1.2639,
+      "step": 60
+    },
+    {
+      "epoch": 0.09472259810554803,
+      "grad_norm": 0.7137989401817322,
+      "learning_rate": 0.0002,
+      "loss": 1.2494,
+      "step": 70
+    },
+    {
+      "epoch": 0.10825439783491204,
+      "grad_norm": 0.5521688461303711,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 80
+    },
+    {
+      "epoch": 0.12178619756427606,
+      "grad_norm": 0.8050252795219421,
+      "learning_rate": 0.0002,
+      "loss": 1.2882,
+      "step": 90
+    },
+    {
+      "epoch": 0.13531799729364005,
+      "grad_norm": 0.6771517992019653,
+      "learning_rate": 0.0002,
+      "loss": 1.2237,
+      "step": 100
+    },
+    {
+      "epoch": 0.14884979702300405,
+      "grad_norm": 0.8056462407112122,
+      "learning_rate": 0.0002,
+      "loss": 1.2265,
+      "step": 110
+    },
+    {
+      "epoch": 0.16238159675236807,
+      "grad_norm": 0.7240928411483765,
+      "learning_rate": 0.0002,
+      "loss": 1.3218,
+      "step": 120
+    },
+    {
+      "epoch": 0.17591339648173207,
+      "grad_norm": 0.5310224294662476,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 130
+    },
+    {
+      "epoch": 0.18944519621109607,
+      "grad_norm": 0.5730571150779724,
+      "learning_rate": 0.0002,
+      "loss": 1.2196,
+      "step": 140
+    },
+    {
+      "epoch": 0.2029769959404601,
+      "grad_norm": 0.5288769006729126,
+      "learning_rate": 0.0002,
+      "loss": 1.3183,
+      "step": 150
+    },
+    {
+      "epoch": 0.2165087956698241,
+      "grad_norm": 0.5447443723678589,
+      "learning_rate": 0.0002,
+      "loss": 1.1886,
+      "step": 160
+    },
+    {
+      "epoch": 0.23004059539918809,
+      "grad_norm": 2.2740917205810547,
+      "learning_rate": 0.0002,
+      "loss": 1.1994,
+      "step": 170
+    },
+    {
+      "epoch": 0.2435723951285521,
+      "grad_norm": 0.6200950741767883,
+      "learning_rate": 0.0002,
+      "loss": 1.2398,
+      "step": 180
+    },
+    {
+      "epoch": 0.2571041948579161,
+      "grad_norm": 1.1782610416412354,
+      "learning_rate": 0.0002,
+      "loss": 1.2827,
+      "step": 190
+    },
+    {
+      "epoch": 0.2706359945872801,
+      "grad_norm": 0.7893068194389343,
+      "learning_rate": 0.0002,
+      "loss": 1.1697,
+      "step": 200
+    },
+    {
+      "epoch": 0.28416779431664413,
+      "grad_norm": 0.5421761274337769,
+      "learning_rate": 0.0002,
+      "loss": 1.221,
+      "step": 210
+    },
+    {
+      "epoch": 0.2976995940460081,
+      "grad_norm": 0.5460169315338135,
+      "learning_rate": 0.0002,
+      "loss": 1.2019,
+      "step": 220
+    },
+    {
+      "epoch": 0.3112313937753721,
+      "grad_norm": 0.5957289934158325,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 230
+    },
+    {
+      "epoch": 0.32476319350473615,
+      "grad_norm": 0.745093047618866,
+      "learning_rate": 0.0002,
+      "loss": 1.2545,
+      "step": 240
+    },
+    {
+      "epoch": 0.3382949932341001,
+      "grad_norm": 0.622627317905426,
+      "learning_rate": 0.0002,
+      "loss": 1.2408,
+      "step": 250
+    },
+    {
+      "epoch": 0.35182679296346414,
+      "grad_norm": 0.5773138999938965,
+      "learning_rate": 0.0002,
+      "loss": 1.3083,
+      "step": 260
+    },
+    {
+      "epoch": 0.36535859269282817,
+      "grad_norm": 1.104275107383728,
+      "learning_rate": 0.0002,
+      "loss": 1.0475,
+      "step": 270
+    },
+    {
+      "epoch": 0.37889039242219213,
+      "grad_norm": 0.5755344033241272,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 280
+    },
+    {
+      "epoch": 0.39242219215155616,
+      "grad_norm": 0.5885311961174011,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 290
+    },
+    {
+      "epoch": 0.4059539918809202,
+      "grad_norm": 1.448182225227356,
+      "learning_rate": 0.0002,
+      "loss": 1.2099,
+      "step": 300
+    },
+    {
+      "epoch": 0.41948579161028415,
+      "grad_norm": 0.5983599424362183,
+      "learning_rate": 0.0002,
+      "loss": 1.2114,
+      "step": 310
+    },
+    {
+      "epoch": 0.4330175913396482,
+      "grad_norm": 0.5013539791107178,
+      "learning_rate": 0.0002,
+      "loss": 1.1691,
+      "step": 320
+    },
+    {
+      "epoch": 0.4465493910690122,
+      "grad_norm": 0.8935738205909729,
+      "learning_rate": 0.0002,
+      "loss": 1.138,
+      "step": 330
+    },
+    {
+      "epoch": 0.46008119079837617,
+      "grad_norm": 0.5642115473747253,
+      "learning_rate": 0.0002,
+      "loss": 1.1336,
+      "step": 340
+    },
+    {
+      "epoch": 0.4736129905277402,
+      "grad_norm": 0.7897255420684814,
+      "learning_rate": 0.0002,
+      "loss": 1.2355,
+      "step": 350
+    },
+    {
+      "epoch": 0.4871447902571042,
+      "grad_norm": 1.6891459226608276,
+      "learning_rate": 0.0002,
+      "loss": 1.1067,
+      "step": 360
+    },
+    {
+      "epoch": 0.5006765899864682,
+      "grad_norm": 1.1374807357788086,
+      "learning_rate": 0.0002,
+      "loss": 1.1787,
+      "step": 370
+    },
+    {
+      "epoch": 0.5142083897158322,
+      "grad_norm": 0.5355549454689026,
+      "learning_rate": 0.0002,
+      "loss": 1.1525,
+      "step": 380
+    },
+    {
+      "epoch": 0.5277401894451962,
+      "grad_norm": 0.656196653842926,
+      "learning_rate": 0.0002,
+      "loss": 1.1298,
+      "step": 390
+    },
+    {
+      "epoch": 0.5412719891745602,
+      "grad_norm": 0.8692356944084167,
+      "learning_rate": 0.0002,
+      "loss": 1.1471,
+      "step": 400
+    },
+    {
+      "epoch": 0.5548037889039242,
+      "grad_norm": 0.5873697400093079,
+      "learning_rate": 0.0002,
+      "loss": 0.9781,
+      "step": 410
+    },
+    {
+      "epoch": 0.5683355886332883,
+      "grad_norm": 0.8922758102416992,
+      "learning_rate": 0.0002,
+      "loss": 1.1841,
+      "step": 420
+    },
+    {
+      "epoch": 0.5818673883626523,
+      "grad_norm": 0.5048012733459473,
+      "learning_rate": 0.0002,
+      "loss": 1.184,
+      "step": 430
+    },
+    {
+      "epoch": 0.5953991880920162,
+      "grad_norm": 0.603631317615509,
+      "learning_rate": 0.0002,
+      "loss": 1.3276,
+      "step": 440
+    },
+    {
+      "epoch": 0.6089309878213802,
+      "grad_norm": 0.5635734796524048,
+      "learning_rate": 0.0002,
+      "loss": 1.0459,
+      "step": 450
+    },
+    {
+      "epoch": 0.6224627875507442,
+      "grad_norm": 0.7483186721801758,
+      "learning_rate": 0.0002,
+      "loss": 1.1549,
+      "step": 460
+    },
+    {
+      "epoch": 0.6359945872801083,
+      "grad_norm": 0.5372456312179565,
+      "learning_rate": 0.0002,
+      "loss": 1.1141,
+      "step": 470
+    },
+    {
+      "epoch": 0.6495263870094723,
+      "grad_norm": 0.9461246728897095,
+      "learning_rate": 0.0002,
+      "loss": 1.2966,
+      "step": 480
+    },
+    {
+      "epoch": 0.6630581867388363,
+      "grad_norm": 0.7017379403114319,
+      "learning_rate": 0.0002,
+      "loss": 1.1226,
+      "step": 490
+    },
+    {
+      "epoch": 0.6765899864682002,
+      "grad_norm": 1.6177887916564941,
+      "learning_rate": 0.0002,
+      "loss": 1.2138,
+      "step": 500
+    },
+    {
+      "epoch": 0.6901217861975643,
+      "grad_norm": 0.4857328236103058,
+      "learning_rate": 0.0002,
+      "loss": 1.0716,
+      "step": 510
+    },
+    {
+      "epoch": 0.7036535859269283,
+      "grad_norm": 1.0545706748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.2898,
+      "step": 520
+    },
+    {
+      "epoch": 0.7171853856562923,
+      "grad_norm": 0.7486541867256165,
+      "learning_rate": 0.0002,
+      "loss": 1.2527,
+      "step": 530
+    },
+    {
+      "epoch": 0.7307171853856563,
+      "grad_norm": 0.6588427424430847,
+      "learning_rate": 0.0002,
+      "loss": 1.0634,
+      "step": 540
+    },
+    {
+      "epoch": 0.7442489851150202,
+      "grad_norm": 0.9485914707183838,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 550
+    },
+    {
+      "epoch": 0.7577807848443843,
+      "grad_norm": 0.7465947866439819,
+      "learning_rate": 0.0002,
+      "loss": 1.1933,
+      "step": 560
+    },
+    {
+      "epoch": 0.7713125845737483,
+      "grad_norm": 0.6392837166786194,
+      "learning_rate": 0.0002,
+      "loss": 1.0927,
+      "step": 570
+    },
+    {
+      "epoch": 0.7848443843031123,
+      "grad_norm": 0.4898282587528229,
+      "learning_rate": 0.0002,
+      "loss": 1.14,
+      "step": 580
+    },
+    {
+      "epoch": 0.7983761840324763,
+      "grad_norm": 0.5636171102523804,
+      "learning_rate": 0.0002,
+      "loss": 1.0425,
+      "step": 590
+    },
+    {
+      "epoch": 0.8119079837618404,
+      "grad_norm": 0.6637675166130066,
+      "learning_rate": 0.0002,
+      "loss": 1.0717,
+      "step": 600
+    },
+    {
+      "epoch": 0.8254397834912043,
+      "grad_norm": 1.1842738389968872,
+      "learning_rate": 0.0002,
+      "loss": 1.1204,
+      "step": 610
+    },
+    {
+      "epoch": 0.8389715832205683,
+      "grad_norm": 0.5699004530906677,
+      "learning_rate": 0.0002,
+      "loss": 1.083,
+      "step": 620
+    },
+    {
+      "epoch": 0.8525033829499323,
+      "grad_norm": 0.7748669385910034,
+      "learning_rate": 0.0002,
+      "loss": 1.153,
+      "step": 630
+    },
+    {
+      "epoch": 0.8660351826792964,
+      "grad_norm": 0.7987180352210999,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 640
+    },
+    {
+      "epoch": 0.8795669824086604,
+      "grad_norm": 1.0740629434585571,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 650
+    },
+    {
+      "epoch": 0.8930987821380244,
+      "grad_norm": 0.731082022190094,
+      "learning_rate": 0.0002,
+      "loss": 1.0503,
+      "step": 660
+    },
+    {
+      "epoch": 0.9066305818673883,
+      "grad_norm": 0.9066846966743469,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 670
+    },
+    {
+      "epoch": 0.9201623815967523,
+      "grad_norm": 0.9934597015380859,
+      "learning_rate": 0.0002,
+      "loss": 1.0516,
+      "step": 680
+    },
+    {
+      "epoch": 0.9336941813261164,
+      "grad_norm": 0.7975896000862122,
+      "learning_rate": 0.0002,
+      "loss": 1.0983,
+      "step": 690
+    },
+    {
+      "epoch": 0.9472259810554804,
+      "grad_norm": 0.9127744436264038,
+      "learning_rate": 0.0002,
+      "loss": 1.2362,
+      "step": 700
+    },
+    {
+      "epoch": 0.9607577807848444,
+      "grad_norm": 0.7682064175605774,
+      "learning_rate": 0.0002,
+      "loss": 1.0273,
+      "step": 710
+    },
+    {
+      "epoch": 0.9742895805142084,
+      "grad_norm": 0.9808696508407593,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 720
+    },
+    {
+      "epoch": 0.9878213802435724,
+      "grad_norm": 1.0826992988586426,
+      "learning_rate": 0.0002,
+      "loss": 1.0979,
+      "step": 730
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2867646217346191,
+      "eval_runtime": 53.6569,
+      "eval_samples_per_second": 8.498,
+      "eval_steps_per_second": 1.062,
+      "step": 739
+    },
+    {
+      "epoch": 1.0013531799729365,
+      "grad_norm": 0.6498591303825378,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 740
+    },
+    {
+      "epoch": 1.0148849797023005,
+      "grad_norm": 0.8040738701820374,
+      "learning_rate": 0.0002,
+      "loss": 1.0737,
+      "step": 750
+    },
+    {
+      "epoch": 1.0284167794316643,
+      "grad_norm": 0.7280412912368774,
+      "learning_rate": 0.0002,
+      "loss": 1.03,
+      "step": 760
+    },
+    {
+      "epoch": 1.0419485791610283,
+      "grad_norm": 0.672149121761322,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 770
+    },
+    {
+      "epoch": 1.0554803788903924,
+      "grad_norm": 1.1186308860778809,
+      "learning_rate": 0.0002,
+      "loss": 0.997,
+      "step": 780
+    },
+    {
+      "epoch": 1.0690121786197564,
+      "grad_norm": 0.9073583483695984,
+      "learning_rate": 0.0002,
+      "loss": 0.9088,
+      "step": 790
+    },
+    {
+      "epoch": 1.0825439783491204,
+      "grad_norm": 0.6135605573654175,
+      "learning_rate": 0.0002,
+      "loss": 0.9413,
+      "step": 800
+    },
+    {
+      "epoch": 1.0960757780784844,
+      "grad_norm": 0.5854787826538086,
+      "learning_rate": 0.0002,
+      "loss": 0.9024,
+      "step": 810
+    },
+    {
+      "epoch": 1.1096075778078485,
+      "grad_norm": 0.9077727794647217,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 820
+    },
+    {
+      "epoch": 1.1231393775372125,
+      "grad_norm": 0.7072564363479614,
+      "learning_rate": 0.0002,
+      "loss": 0.9489,
+      "step": 830
+    },
+    {
+      "epoch": 1.1366711772665765,
+      "grad_norm": 0.9457924365997314,
+      "learning_rate": 0.0002,
+      "loss": 0.9275,
+      "step": 840
+    },
+    {
+      "epoch": 1.1502029769959405,
+      "grad_norm": 0.9216122031211853,
+      "learning_rate": 0.0002,
+      "loss": 0.9998,
+      "step": 850
+    },
+    {
+      "epoch": 1.1637347767253043,
+      "grad_norm": 1.0899791717529297,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 860
+    },
+    {
+      "epoch": 1.1772665764546684,
+      "grad_norm": 0.8594662547111511,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 870
+    },
+    {
+      "epoch": 1.1907983761840324,
+      "grad_norm": 0.8680914640426636,
+      "learning_rate": 0.0002,
+      "loss": 0.9513,
+      "step": 880
+    },
+    {
+      "epoch": 1.2043301759133964,
+      "grad_norm": 0.5579341650009155,
+      "learning_rate": 0.0002,
+      "loss": 0.9695,
+      "step": 890
+    },
+    {
+      "epoch": 1.2178619756427604,
+      "grad_norm": 0.8556986451148987,
+      "learning_rate": 0.0002,
+      "loss": 1.0153,
+      "step": 900
+    },
+    {
+      "epoch": 1.2313937753721245,
+      "grad_norm": 1.8943263292312622,
+      "learning_rate": 0.0002,
+      "loss": 0.9589,
+      "step": 910
+    },
+    {
+      "epoch": 1.2449255751014885,
+      "grad_norm": 0.7652221918106079,
+      "learning_rate": 0.0002,
+      "loss": 0.9554,
+      "step": 920
+    },
+    {
+      "epoch": 1.2584573748308525,
+      "grad_norm": 0.6921482086181641,
+      "learning_rate": 0.0002,
+      "loss": 0.955,
+      "step": 930
+    },
+    {
+      "epoch": 1.2719891745602165,
+      "grad_norm": 0.7211646437644958,
+      "learning_rate": 0.0002,
+      "loss": 1.0335,
+      "step": 940
+    },
+    {
+      "epoch": 1.2855209742895806,
+      "grad_norm": 0.9096421599388123,
+      "learning_rate": 0.0002,
+      "loss": 1.0597,
+      "step": 950
+    },
+    {
+      "epoch": 1.2990527740189446,
+      "grad_norm": 0.743715226650238,
+      "learning_rate": 0.0002,
+      "loss": 1.1143,
+      "step": 960
+    },
+    {
+      "epoch": 1.3125845737483086,
+      "grad_norm": 0.9247064590454102,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 970
+    },
+    {
+      "epoch": 1.3261163734776726,
+      "grad_norm": 1.0811798572540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9534,
+      "step": 980
+    },
+    {
+      "epoch": 1.3396481732070367,
+      "grad_norm": 0.7317015528678894,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 990
+    },
+    {
+      "epoch": 1.3531799729364005,
+      "grad_norm": 0.8399309515953064,
+      "learning_rate": 0.0002,
+      "loss": 1.0071,
+      "step": 1000
+    },
+    {
+      "epoch": 1.3667117726657645,
+      "grad_norm": 1.094558835029602,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 1010
+    },
+    {
+      "epoch": 1.3802435723951285,
+      "grad_norm": 1.3759856224060059,
+      "learning_rate": 0.0002,
+      "loss": 0.8744,
+      "step": 1020
+    },
+    {
+      "epoch": 1.3937753721244925,
+      "grad_norm": 0.8855497241020203,
+      "learning_rate": 0.0002,
+      "loss": 0.915,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4073071718538566,
+      "grad_norm": 3.6836671829223633,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1040
+    },
+    {
+      "epoch": 1.4208389715832206,
+      "grad_norm": 1.1119214296340942,
+      "learning_rate": 0.0002,
+      "loss": 0.8975,
+      "step": 1050
+    },
+    {
+      "epoch": 1.4343707713125846,
+      "grad_norm": 0.8871118426322937,
+      "learning_rate": 0.0002,
+      "loss": 0.9381,
+      "step": 1060
+    },
+    {
+      "epoch": 1.4479025710419486,
+      "grad_norm": 0.9937213063240051,
+      "learning_rate": 0.0002,
+      "loss": 0.9091,
+      "step": 1070
+    },
+    {
+      "epoch": 1.4614343707713127,
+      "grad_norm": 0.7206485867500305,
+      "learning_rate": 0.0002,
+      "loss": 0.9923,
+      "step": 1080
+    },
+    {
+      "epoch": 1.4749661705006765,
+      "grad_norm": 0.8442404866218567,
+      "learning_rate": 0.0002,
+      "loss": 0.951,
+      "step": 1090
+    },
+    {
+      "epoch": 1.4884979702300405,
+      "grad_norm": 0.9265049695968628,
+      "learning_rate": 0.0002,
+      "loss": 0.8609,
+      "step": 1100
+    },
+    {
+      "epoch": 1.5020297699594045,
+      "grad_norm": 1.1033650636672974,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 1110
+    },
+    {
+      "epoch": 1.5155615696887685,
+      "grad_norm": 0.7876176834106445,
+      "learning_rate": 0.0002,
+      "loss": 1.004,
+      "step": 1120
+    },
+    {
+      "epoch": 1.5290933694181326,
+      "grad_norm": 0.7761271595954895,
+      "learning_rate": 0.0002,
+      "loss": 0.9555,
+      "step": 1130
+    },
+    {
+      "epoch": 1.5426251691474966,
+      "grad_norm": 1.0603803396224976,
+      "learning_rate": 0.0002,
+      "loss": 0.9569,
+      "step": 1140
+    },
+    {
+      "epoch": 1.5561569688768606,
+      "grad_norm": 0.7715556621551514,
+      "learning_rate": 0.0002,
+      "loss": 0.9842,
+      "step": 1150
+    },
+    {
+      "epoch": 1.5696887686062246,
+      "grad_norm": 0.6591511368751526,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 1160
+    },
+    {
+      "epoch": 1.5832205683355887,
+      "grad_norm": 1.1773475408554077,
+      "learning_rate": 0.0002,
+      "loss": 0.9584,
+      "step": 1170
+    },
+    {
+      "epoch": 1.5967523680649527,
+      "grad_norm": 0.8513862490653992,
+      "learning_rate": 0.0002,
+      "loss": 0.9229,
+      "step": 1180
+    },
+    {
+      "epoch": 1.6102841677943167,
+      "grad_norm": 1.0796581506729126,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1190
+    },
+    {
+      "epoch": 1.6238159675236807,
+      "grad_norm": 0.8897230625152588,
+      "learning_rate": 0.0002,
+      "loss": 0.9698,
+      "step": 1200
+    },
+    {
+      "epoch": 1.6373477672530448,
+      "grad_norm": 1.4640971422195435,
+      "learning_rate": 0.0002,
+      "loss": 0.9295,
+      "step": 1210
+    },
+    {
+      "epoch": 1.6508795669824088,
+      "grad_norm": 1.123056173324585,
+      "learning_rate": 0.0002,
+      "loss": 1.003,
+      "step": 1220
+    },
+    {
+      "epoch": 1.6644113667117728,
+      "grad_norm": 1.1064175367355347,
+      "learning_rate": 0.0002,
+      "loss": 0.9524,
+      "step": 1230
+    },
+    {
+      "epoch": 1.6779431664411368,
+      "grad_norm": 2.4434642791748047,
+      "learning_rate": 0.0002,
+      "loss": 0.8896,
+      "step": 1240
+    },
+    {
+      "epoch": 1.6914749661705006,
+      "grad_norm": 1.0455760955810547,
+      "learning_rate": 0.0002,
+      "loss": 0.9899,
+      "step": 1250
+    },
+    {
+      "epoch": 1.7050067658998647,
+      "grad_norm": 1.1007593870162964,
+      "learning_rate": 0.0002,
+      "loss": 0.9032,
+      "step": 1260
+    },
+    {
+      "epoch": 1.7185385656292287,
+      "grad_norm": 1.2697606086730957,
+      "learning_rate": 0.0002,
+      "loss": 0.9226,
+      "step": 1270
+    },
+    {
+      "epoch": 1.7320703653585927,
+      "grad_norm": 1.1537855863571167,
+      "learning_rate": 0.0002,
+      "loss": 0.8771,
+      "step": 1280
+    },
+    {
+      "epoch": 1.7456021650879567,
+      "grad_norm": 0.9637187719345093,
+      "learning_rate": 0.0002,
+      "loss": 0.8655,
+      "step": 1290
+    },
+    {
+      "epoch": 1.7591339648173205,
+      "grad_norm": 1.1610347032546997,
+      "learning_rate": 0.0002,
+      "loss": 0.9641,
+      "step": 1300
+    },
+    {
+      "epoch": 1.7726657645466846,
+      "grad_norm": 0.717607319355011,
+      "learning_rate": 0.0002,
+      "loss": 0.9417,
+      "step": 1310
+    },
+    {
+      "epoch": 1.7861975642760486,
+      "grad_norm": 1.753371238708496,
+      "learning_rate": 0.0002,
+      "loss": 0.8852,
+      "step": 1320
+    },
+    {
+      "epoch": 1.7997293640054126,
+      "grad_norm": 0.7919637560844421,
+      "learning_rate": 0.0002,
+      "loss": 1.0327,
+      "step": 1330
+    },
+    {
+      "epoch": 1.8132611637347766,
+      "grad_norm": 1.1091023683547974,
+      "learning_rate": 0.0002,
+      "loss": 1.0019,
+      "step": 1340
+    },
+    {
+      "epoch": 1.8267929634641407,
+      "grad_norm": 0.7157362699508667,
+      "learning_rate": 0.0002,
+      "loss": 0.9457,
+      "step": 1350
+    },
+    {
+      "epoch": 1.8403247631935047,
+      "grad_norm": 0.9538856744766235,
+      "learning_rate": 0.0002,
+      "loss": 0.9818,
+      "step": 1360
+    },
+    {
+      "epoch": 1.8538565629228687,
+      "grad_norm": 1.689642071723938,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 1370
+    },
+    {
+      "epoch": 1.8673883626522327,
+      "grad_norm": 1.3405762910842896,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1380
+    },
+    {
+      "epoch": 1.8809201623815968,
+      "grad_norm": 1.187905192375183,
+      "learning_rate": 0.0002,
+      "loss": 0.9279,
+      "step": 1390
+    },
+    {
+      "epoch": 1.8944519621109608,
+      "grad_norm": 1.403511643409729,
+      "learning_rate": 0.0002,
+      "loss": 0.9266,
+      "step": 1400
+    },
+    {
+      "epoch": 1.9079837618403248,
+      "grad_norm": 1.4245457649230957,
+      "learning_rate": 0.0002,
+      "loss": 0.9654,
+      "step": 1410
+    },
+    {
+      "epoch": 1.9215155615696888,
+      "grad_norm": 0.6742255687713623,
+      "learning_rate": 0.0002,
+      "loss": 0.9047,
+      "step": 1420
+    },
+    {
+      "epoch": 1.9350473612990529,
+      "grad_norm": 0.9301473498344421,
+      "learning_rate": 0.0002,
+      "loss": 0.9837,
+      "step": 1430
+    },
+    {
+      "epoch": 1.9485791610284169,
+      "grad_norm": 0.8039385080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 1440
+    },
+    {
+      "epoch": 1.962110960757781,
+      "grad_norm": 0.7449126839637756,
+      "learning_rate": 0.0002,
+      "loss": 0.9433,
+      "step": 1450
+    },
+    {
+      "epoch": 1.975642760487145,
+      "grad_norm": 1.8016695976257324,
+      "learning_rate": 0.0002,
+      "loss": 0.9424,
+      "step": 1460
+    },
+    {
+      "epoch": 1.989174560216509,
+      "grad_norm": 1.3347259759902954,
+      "learning_rate": 0.0002,
+      "loss": 0.9434,
+      "step": 1470
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.28184974193573,
+      "eval_runtime": 54.9872,
+      "eval_samples_per_second": 8.293,
+      "eval_steps_per_second": 1.037,
+      "step": 1478
+    },
+    {
+      "epoch": 2.002706359945873,
+      "grad_norm": 1.1238535642623901,
+      "learning_rate": 0.0002,
+      "loss": 0.8667,
+      "step": 1480
+    },
+    {
+      "epoch": 2.016238159675237,
+      "grad_norm": 0.9186404943466187,
+      "learning_rate": 0.0002,
+      "loss": 0.741,
+      "step": 1490
+    },
+    {
+      "epoch": 2.029769959404601,
+      "grad_norm": 1.42500901222229,
+      "learning_rate": 0.0002,
+      "loss": 0.7402,
+      "step": 1500
+    },
+    {
+      "epoch": 2.0433017591339646,
+      "grad_norm": 0.9018896222114563,
+      "learning_rate": 0.0002,
+      "loss": 0.6594,
+      "step": 1510
+    },
+    {
+      "epoch": 2.0568335588633286,
+      "grad_norm": 0.9482589364051819,
+      "learning_rate": 0.0002,
+      "loss": 0.7973,
+      "step": 1520
+    },
+    {
+      "epoch": 2.0703653585926927,
+      "grad_norm": 1.7364821434020996,
+      "learning_rate": 0.0002,
+      "loss": 0.7494,
+      "step": 1530
+    },
+    {
+      "epoch": 2.0838971583220567,
+      "grad_norm": 1.1600096225738525,
+      "learning_rate": 0.0002,
+      "loss": 0.6691,
+      "step": 1540
+    },
+    {
+      "epoch": 2.0974289580514207,
+      "grad_norm": 1.1180989742279053,
+      "learning_rate": 0.0002,
+      "loss": 0.7443,
+      "step": 1550
+    },
+    {
+      "epoch": 2.1109607577807847,
+      "grad_norm": 0.7978046536445618,
+      "learning_rate": 0.0002,
+      "loss": 0.7261,
+      "step": 1560
+    },
+    {
+      "epoch": 2.1244925575101488,
+      "grad_norm": 0.9089515805244446,
+      "learning_rate": 0.0002,
+      "loss": 0.7659,
+      "step": 1570
+    },
+    {
+      "epoch": 2.138024357239513,
+      "grad_norm": 1.2802879810333252,
+      "learning_rate": 0.0002,
+      "loss": 0.7751,
+      "step": 1580
+    },
+    {
+      "epoch": 2.151556156968877,
+      "grad_norm": 1.1321839094161987,
+      "learning_rate": 0.0002,
+      "loss": 0.7744,
+      "step": 1590
+    },
+    {
+      "epoch": 2.165087956698241,
+      "grad_norm": 0.9985150098800659,
+      "learning_rate": 0.0002,
+      "loss": 0.699,
+      "step": 1600
+    },
+    {
+      "epoch": 2.178619756427605,
+      "grad_norm": 1.1864978075027466,
+      "learning_rate": 0.0002,
+      "loss": 0.7497,
+      "step": 1610
+    },
+    {
+      "epoch": 2.192151556156969,
+      "grad_norm": 1.0220770835876465,
+      "learning_rate": 0.0002,
+      "loss": 0.7648,
+      "step": 1620
+    },
+    {
+      "epoch": 2.205683355886333,
+      "grad_norm": 1.075281023979187,
+      "learning_rate": 0.0002,
+      "loss": 0.7786,
+      "step": 1630
+    },
+    {
+      "epoch": 2.219215155615697,
+      "grad_norm": 1.7539390325546265,
+      "learning_rate": 0.0002,
+      "loss": 0.7169,
+      "step": 1640
+    },
+    {
+      "epoch": 2.232746955345061,
+      "grad_norm": 0.9327954053878784,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 1650
+    },
+    {
+      "epoch": 2.246278755074425,
+      "grad_norm": 1.1239676475524902,
+      "learning_rate": 0.0002,
+      "loss": 0.7096,
+      "step": 1660
+    },
+    {
+      "epoch": 2.259810554803789,
+      "grad_norm": 0.887867271900177,
+      "learning_rate": 0.0002,
+      "loss": 0.7516,
+      "step": 1670
+    },
+    {
+      "epoch": 2.273342354533153,
+      "grad_norm": 0.9934070110321045,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 1680
+    },
+    {
+      "epoch": 2.286874154262517,
+      "grad_norm": 1.1046375036239624,
+      "learning_rate": 0.0002,
+      "loss": 0.7891,
+      "step": 1690
+    },
+    {
+      "epoch": 2.300405953991881,
+      "grad_norm": 1.3520793914794922,
+      "learning_rate": 0.0002,
+      "loss": 0.7123,
+      "step": 1700
+    },
+    {
+      "epoch": 2.313937753721245,
+      "grad_norm": 1.0396424531936646,
+      "learning_rate": 0.0002,
+      "loss": 0.722,
+      "step": 1710
+    },
+    {
+      "epoch": 2.3274695534506087,
+      "grad_norm": 1.312713861465454,
+      "learning_rate": 0.0002,
+      "loss": 0.7645,
+      "step": 1720
+    },
+    {
+      "epoch": 2.3410013531799727,
+      "grad_norm": 1.2425963878631592,
+      "learning_rate": 0.0002,
+      "loss": 0.7743,
+      "step": 1730
+    },
+    {
+      "epoch": 2.3545331529093367,
+      "grad_norm": 1.0335496664047241,
+      "learning_rate": 0.0002,
+      "loss": 0.7069,
+      "step": 1740
+    },
+    {
+      "epoch": 2.3680649526387008,
+      "grad_norm": 0.8289833664894104,
+      "learning_rate": 0.0002,
+      "loss": 0.763,
+      "step": 1750
+    },
+    {
+      "epoch": 2.381596752368065,
+      "grad_norm": 1.1725471019744873,
+      "learning_rate": 0.0002,
+      "loss": 0.749,
+      "step": 1760
+    },
+    {
+      "epoch": 2.395128552097429,
+      "grad_norm": 1.10824716091156,
+      "learning_rate": 0.0002,
+      "loss": 0.7842,
+      "step": 1770
+    },
+    {
+      "epoch": 2.408660351826793,
+      "grad_norm": 1.027957797050476,
+      "learning_rate": 0.0002,
+      "loss": 0.8115,
+      "step": 1780
+    },
+    {
+      "epoch": 2.422192151556157,
+      "grad_norm": 1.4744906425476074,
+      "learning_rate": 0.0002,
+      "loss": 0.7529,
+      "step": 1790
+    },
+    {
+      "epoch": 2.435723951285521,
+      "grad_norm": 2.044746160507202,
+      "learning_rate": 0.0002,
+      "loss": 0.7146,
+      "step": 1800
+    },
+    {
+      "epoch": 2.449255751014885,
+      "grad_norm": 0.9940636157989502,
+      "learning_rate": 0.0002,
+      "loss": 0.6738,
+      "step": 1810
+    },
+    {
+      "epoch": 2.462787550744249,
+      "grad_norm": 1.2338303327560425,
+      "learning_rate": 0.0002,
+      "loss": 0.8233,
+      "step": 1820
+    },
+    {
+      "epoch": 2.476319350473613,
+      "grad_norm": 1.1820061206817627,
+      "learning_rate": 0.0002,
+      "loss": 0.6409,
+      "step": 1830
+    },
+    {
+      "epoch": 2.489851150202977,
+      "grad_norm": 1.5557365417480469,
+      "learning_rate": 0.0002,
+      "loss": 0.7744,
+      "step": 1840
+    },
+    {
+      "epoch": 2.503382949932341,
+      "grad_norm": 0.927599310874939,
+      "learning_rate": 0.0002,
+      "loss": 0.742,
+      "step": 1850
+    },
+    {
+      "epoch": 2.516914749661705,
+      "grad_norm": 1.384813666343689,
+      "learning_rate": 0.0002,
+      "loss": 0.7683,
+      "step": 1860
+    },
+    {
+      "epoch": 2.530446549391069,
+      "grad_norm": 1.0022202730178833,
+      "learning_rate": 0.0002,
+      "loss": 0.7841,
+      "step": 1870
+    },
+    {
+      "epoch": 2.543978349120433,
+      "grad_norm": 1.0271503925323486,
+      "learning_rate": 0.0002,
+      "loss": 0.7422,
+      "step": 1880
+    },
+    {
+      "epoch": 2.557510148849797,
+      "grad_norm": 1.2724764347076416,
+      "learning_rate": 0.0002,
+      "loss": 0.7513,
+      "step": 1890
+    },
+    {
+      "epoch": 2.571041948579161,
+      "grad_norm": 0.9063859581947327,
+      "learning_rate": 0.0002,
+      "loss": 0.7594,
+      "step": 1900
+    },
+    {
+      "epoch": 2.584573748308525,
+      "grad_norm": 0.9433910250663757,
+      "learning_rate": 0.0002,
+      "loss": 0.7423,
+      "step": 1910
+    },
+    {
+      "epoch": 2.598105548037889,
+      "grad_norm": 0.8303482532501221,
+      "learning_rate": 0.0002,
+      "loss": 0.6833,
+      "step": 1920
+    },
+    {
+      "epoch": 2.611637347767253,
+      "grad_norm": 1.16862952709198,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 1930
+    },
+    {
+      "epoch": 2.6251691474966172,
+      "grad_norm": 0.8904703855514526,
+      "learning_rate": 0.0002,
+      "loss": 0.7544,
+      "step": 1940
+    },
+    {
+      "epoch": 2.6387009472259813,
+      "grad_norm": 1.2958505153656006,
+      "learning_rate": 0.0002,
+      "loss": 0.7339,
+      "step": 1950
+    },
+    {
+      "epoch": 2.6522327469553453,
+      "grad_norm": 1.2310389280319214,
+      "learning_rate": 0.0002,
+      "loss": 0.7502,
+      "step": 1960
+    },
+    {
+      "epoch": 2.6657645466847093,
+      "grad_norm": 1.3157947063446045,
+      "learning_rate": 0.0002,
+      "loss": 0.8305,
+      "step": 1970
+    },
+    {
+      "epoch": 2.6792963464140733,
+      "grad_norm": 0.9247841238975525,
+      "learning_rate": 0.0002,
+      "loss": 0.7348,
+      "step": 1980
+    },
+    {
+      "epoch": 2.6928281461434374,
+      "grad_norm": 0.9850119352340698,
+      "learning_rate": 0.0002,
+      "loss": 0.7352,
+      "step": 1990
+    },
+    {
+      "epoch": 2.706359945872801,
+      "grad_norm": 1.59624183177948,
+      "learning_rate": 0.0002,
+      "loss": 0.7794,
+      "step": 2000
+    },
+    {
+      "epoch": 2.719891745602165,
+      "grad_norm": 1.791932225227356,
+      "learning_rate": 0.0002,
+      "loss": 0.6918,
+      "step": 2010
+    },
+    {
+      "epoch": 2.733423545331529,
+      "grad_norm": 0.8530828356742859,
+      "learning_rate": 0.0002,
+      "loss": 0.7616,
+      "step": 2020
+    },
+    {
+      "epoch": 2.746955345060893,
+      "grad_norm": 1.431843638420105,
+      "learning_rate": 0.0002,
+      "loss": 0.8028,
+      "step": 2030
+    },
+    {
+      "epoch": 2.760487144790257,
+      "grad_norm": 1.1818324327468872,
+      "learning_rate": 0.0002,
+      "loss": 0.7403,
+      "step": 2040
+    },
+    {
+      "epoch": 2.774018944519621,
+      "grad_norm": 1.0456408262252808,
+      "learning_rate": 0.0002,
+      "loss": 0.6752,
+      "step": 2050
+    },
+    {
+      "epoch": 2.787550744248985,
+      "grad_norm": 1.5935403108596802,
+      "learning_rate": 0.0002,
+      "loss": 0.7771,
+      "step": 2060
+    },
+    {
+      "epoch": 2.801082543978349,
+      "grad_norm": 1.6653326749801636,
+      "learning_rate": 0.0002,
+      "loss": 0.722,
+      "step": 2070
+    },
+    {
+      "epoch": 2.814614343707713,
+      "grad_norm": 1.2409698963165283,
+      "learning_rate": 0.0002,
+      "loss": 0.7553,
+      "step": 2080
+    },
+    {
+      "epoch": 2.828146143437077,
+      "grad_norm": 0.8511452674865723,
+      "learning_rate": 0.0002,
+      "loss": 0.7483,
+      "step": 2090
+    },
+    {
+      "epoch": 2.841677943166441,
+      "grad_norm": 1.1064083576202393,
+      "learning_rate": 0.0002,
+      "loss": 0.7773,
+      "step": 2100
+    },
+    {
+      "epoch": 2.855209742895805,
+      "grad_norm": 1.5252450704574585,
+      "learning_rate": 0.0002,
+      "loss": 0.8025,
+      "step": 2110
+    },
+    {
+      "epoch": 2.8687415426251692,
+      "grad_norm": 1.8477630615234375,
+      "learning_rate": 0.0002,
+      "loss": 0.7342,
+      "step": 2120
+    },
+    {
+      "epoch": 2.8822733423545333,
+      "grad_norm": 1.8251630067825317,
+      "learning_rate": 0.0002,
+      "loss": 1.0005,
+      "step": 2130
+    },
+    {
+      "epoch": 2.8958051420838973,
+      "grad_norm": 2.0696771144866943,
+      "learning_rate": 0.0002,
+      "loss": 0.8036,
+      "step": 2140
+    },
+    {
+      "epoch": 2.9093369418132613,
+      "grad_norm": 1.0875508785247803,
+      "learning_rate": 0.0002,
+      "loss": 0.7561,
+      "step": 2150
+    },
+    {
+      "epoch": 2.9228687415426253,
+      "grad_norm": 1.1486080884933472,
+      "learning_rate": 0.0002,
+      "loss": 0.808,
+      "step": 2160
+    },
+    {
+      "epoch": 2.936400541271989,
+      "grad_norm": 1.3913694620132446,
+      "learning_rate": 0.0002,
+      "loss": 0.7418,
+      "step": 2170
+    },
+    {
+      "epoch": 2.949932341001353,
+      "grad_norm": 1.0237643718719482,
+      "learning_rate": 0.0002,
+      "loss": 0.7829,
+      "step": 2180
+    },
+    {
+      "epoch": 2.963464140730717,
+      "grad_norm": 1.0060926675796509,
+      "learning_rate": 0.0002,
+      "loss": 0.8717,
+      "step": 2190
+    },
+    {
+      "epoch": 2.976995940460081,
+      "grad_norm": 0.8395462036132812,
+      "learning_rate": 0.0002,
+      "loss": 0.7581,
+      "step": 2200
+    },
+    {
+      "epoch": 2.990527740189445,
+      "grad_norm": 1.4483158588409424,
+      "learning_rate": 0.0002,
+      "loss": 0.7032,
+      "step": 2210
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.3191189765930176,
+      "eval_runtime": 53.5323,
+      "eval_samples_per_second": 8.518,
+      "eval_steps_per_second": 1.065,
+      "step": 2217
+    },
+    {
+      "epoch": 3.004059539918809,
+      "grad_norm": 0.9717937111854553,
+      "learning_rate": 0.0002,
+      "loss": 0.6775,
+      "step": 2220
+    },
+    {
+      "epoch": 3.017591339648173,
+      "grad_norm": 1.2057876586914062,
+      "learning_rate": 0.0002,
+      "loss": 0.6059,
+      "step": 2230
+    },
+    {
+      "epoch": 3.031123139377537,
+      "grad_norm": 1.2295159101486206,
+      "learning_rate": 0.0002,
+      "loss": 0.5764,
+      "step": 2240
+    },
+    {
+      "epoch": 3.044654939106901,
+      "grad_norm": 1.1200335025787354,
+      "learning_rate": 0.0002,
+      "loss": 0.5088,
+      "step": 2250
+    },
+    {
+      "epoch": 3.058186738836265,
+      "grad_norm": 1.3113594055175781,
+      "learning_rate": 0.0002,
+      "loss": 0.5464,
+      "step": 2260
+    },
+    {
+      "epoch": 3.071718538565629,
+      "grad_norm": 1.3074201345443726,
+      "learning_rate": 0.0002,
+      "loss": 0.5959,
+      "step": 2270
+    },
+    {
+      "epoch": 3.085250338294993,
+      "grad_norm": 1.7636418342590332,
+      "learning_rate": 0.0002,
+      "loss": 0.642,
+      "step": 2280
+    },
+    {
+      "epoch": 3.098782138024357,
+      "grad_norm": 1.2225017547607422,
+      "learning_rate": 0.0002,
+      "loss": 0.5645,
+      "step": 2290
+    },
+    {
+      "epoch": 3.1123139377537212,
+      "grad_norm": 1.062538743019104,
+      "learning_rate": 0.0002,
+      "loss": 0.5587,
+      "step": 2300
+    },
+    {
+      "epoch": 3.1258457374830853,
+      "grad_norm": 1.9475018978118896,
+      "learning_rate": 0.0002,
+      "loss": 0.5426,
+      "step": 2310
+    },
+    {
+      "epoch": 3.1393775372124493,
+      "grad_norm": 1.3695366382598877,
+      "learning_rate": 0.0002,
+      "loss": 0.5028,
+      "step": 2320
+    },
+    {
+      "epoch": 3.1529093369418133,
+      "grad_norm": 1.4610179662704468,
+      "learning_rate": 0.0002,
+      "loss": 0.6278,
+      "step": 2330
+    },
+    {
+      "epoch": 3.1664411366711773,
+      "grad_norm": 1.1319258213043213,
+      "learning_rate": 0.0002,
+      "loss": 0.6062,
+      "step": 2340
+    },
+    {
+      "epoch": 3.1799729364005414,
+      "grad_norm": 1.8418315649032593,
+      "learning_rate": 0.0002,
+      "loss": 0.5946,
+      "step": 2350
+    },
+    {
+      "epoch": 3.1935047361299054,
+      "grad_norm": 1.0682015419006348,
+      "learning_rate": 0.0002,
+      "loss": 0.6215,
+      "step": 2360
+    },
+    {
+      "epoch": 3.2070365358592694,
+      "grad_norm": 0.9852792024612427,
+      "learning_rate": 0.0002,
+      "loss": 0.5431,
+      "step": 2370
+    },
+    {
+      "epoch": 3.2205683355886334,
+      "grad_norm": 1.447991967201233,
+      "learning_rate": 0.0002,
+      "loss": 0.561,
+      "step": 2380
+    },
+    {
+      "epoch": 3.2341001353179974,
+      "grad_norm": 1.3113367557525635,
+      "learning_rate": 0.0002,
+      "loss": 0.6143,
+      "step": 2390
+    },
+    {
+      "epoch": 3.2476319350473615,
+      "grad_norm": 1.412656307220459,
+      "learning_rate": 0.0002,
+      "loss": 0.6268,
+      "step": 2400
+    },
+    {
+      "epoch": 3.2611637347767255,
+      "grad_norm": 1.41526198387146,
+      "learning_rate": 0.0002,
+      "loss": 0.5883,
+      "step": 2410
+    },
+    {
+      "epoch": 3.2746955345060895,
+      "grad_norm": 1.5622785091400146,
+      "learning_rate": 0.0002,
+      "loss": 0.5242,
+      "step": 2420
+    },
+    {
+      "epoch": 3.2882273342354535,
+      "grad_norm": 1.6155788898468018,
+      "learning_rate": 0.0002,
+      "loss": 0.5536,
+      "step": 2430
+    },
+    {
+      "epoch": 3.301759133964817,
+      "grad_norm": 1.4699913263320923,
+      "learning_rate": 0.0002,
+      "loss": 0.5464,
+      "step": 2440
+    },
+    {
+      "epoch": 3.315290933694181,
+      "grad_norm": 1.0095789432525635,
+      "learning_rate": 0.0002,
+      "loss": 0.6074,
+      "step": 2450
+    },
+    {
+      "epoch": 3.328822733423545,
+      "grad_norm": 1.620950698852539,
+      "learning_rate": 0.0002,
+      "loss": 0.5316,
+      "step": 2460
+    },
+    {
+      "epoch": 3.342354533152909,
+      "grad_norm": 1.4491326808929443,
+      "learning_rate": 0.0002,
+      "loss": 0.6617,
+      "step": 2470
+    },
+    {
+      "epoch": 3.3558863328822732,
+      "grad_norm": 1.9128118753433228,
+      "learning_rate": 0.0002,
+      "loss": 0.5639,
+      "step": 2480
+    },
+    {
+      "epoch": 3.3694181326116373,
+      "grad_norm": 1.36688232421875,
+      "learning_rate": 0.0002,
+      "loss": 0.5958,
+      "step": 2490
+    },
+    {
+      "epoch": 3.3829499323410013,
+      "grad_norm": 1.455443263053894,
+      "learning_rate": 0.0002,
+      "loss": 0.5432,
+      "step": 2500
+    },
+    {
+      "epoch": 3.3964817320703653,
+      "grad_norm": 1.2894777059555054,
+      "learning_rate": 0.0002,
+      "loss": 0.6697,
+      "step": 2510
+    },
+    {
+      "epoch": 3.4100135317997293,
+      "grad_norm": 1.3889403343200684,
+      "learning_rate": 0.0002,
+      "loss": 0.5893,
+      "step": 2520
+    },
+    {
+      "epoch": 3.4235453315290933,
+      "grad_norm": 1.4315358400344849,
+      "learning_rate": 0.0002,
+      "loss": 0.5148,
+      "step": 2530
+    },
+    {
+      "epoch": 3.4370771312584574,
+      "grad_norm": 1.3308886289596558,
+      "learning_rate": 0.0002,
+      "loss": 0.5766,
+      "step": 2540
+    },
+    {
+      "epoch": 3.4506089309878214,
+      "grad_norm": 1.2735179662704468,
+      "learning_rate": 0.0002,
+      "loss": 0.6168,
+      "step": 2550
+    },
+    {
+      "epoch": 3.4641407307171854,
+      "grad_norm": 1.2731887102127075,
+      "learning_rate": 0.0002,
+      "loss": 0.6133,
+      "step": 2560
+    },
+    {
+      "epoch": 3.4776725304465494,
+      "grad_norm": 2.390596628189087,
+      "learning_rate": 0.0002,
+      "loss": 0.5956,
+      "step": 2570
+    },
+    {
+      "epoch": 3.4912043301759135,
+      "grad_norm": 1.3651424646377563,
+      "learning_rate": 0.0002,
+      "loss": 0.576,
+      "step": 2580
+    },
+    {
+      "epoch": 3.5047361299052775,
+      "grad_norm": 0.9903562068939209,
+      "learning_rate": 0.0002,
+      "loss": 0.5456,
+      "step": 2590
+    },
+    {
+      "epoch": 3.5182679296346415,
+      "grad_norm": 1.467106580734253,
+      "learning_rate": 0.0002,
+      "loss": 0.6323,
+      "step": 2600
+    },
+    {
+      "epoch": 3.5317997293640055,
+      "grad_norm": 1.4800456762313843,
+      "learning_rate": 0.0002,
+      "loss": 0.6195,
+      "step": 2610
+    },
+    {
+      "epoch": 3.5453315290933696,
+      "grad_norm": 1.140714406967163,
+      "learning_rate": 0.0002,
+      "loss": 0.5971,
+      "step": 2620
+    },
+    {
+      "epoch": 3.558863328822733,
+      "grad_norm": 2.1062142848968506,
+      "learning_rate": 0.0002,
+      "loss": 0.6463,
+      "step": 2630
+    },
+    {
+      "epoch": 3.572395128552097,
+      "grad_norm": 1.3074438571929932,
+      "learning_rate": 0.0002,
+      "loss": 0.5909,
+      "step": 2640
+    },
+    {
+      "epoch": 3.585926928281461,
+      "grad_norm": 1.80443274974823,
+      "learning_rate": 0.0002,
+      "loss": 0.6123,
+      "step": 2650
+    },
+    {
+      "epoch": 3.5994587280108252,
+      "grad_norm": 1.0620969533920288,
+      "learning_rate": 0.0002,
+      "loss": 0.6347,
+      "step": 2660
+    },
+    {
+      "epoch": 3.6129905277401893,
+      "grad_norm": 1.3793504238128662,
+      "learning_rate": 0.0002,
+      "loss": 0.6084,
+      "step": 2670
+    },
+    {
+      "epoch": 3.6265223274695533,
+      "grad_norm": 1.0759015083312988,
+      "learning_rate": 0.0002,
+      "loss": 0.615,
+      "step": 2680
+    },
+    {
+      "epoch": 3.6400541271989173,
+      "grad_norm": 1.5374208688735962,
+      "learning_rate": 0.0002,
+      "loss": 0.6319,
+      "step": 2690
+    },
+    {
+      "epoch": 3.6535859269282813,
+      "grad_norm": 1.690587043762207,
+      "learning_rate": 0.0002,
+      "loss": 0.5328,
+      "step": 2700
+    },
+    {
+      "epoch": 3.6671177266576453,
+      "grad_norm": 1.2092949151992798,
+      "learning_rate": 0.0002,
+      "loss": 0.5715,
+      "step": 2710
+    },
+    {
+      "epoch": 3.6806495263870094,
+      "grad_norm": 1.8789589405059814,
+      "learning_rate": 0.0002,
+      "loss": 0.6063,
+      "step": 2720
+    },
+    {
+      "epoch": 3.6941813261163734,
+      "grad_norm": 1.5840286016464233,
+      "learning_rate": 0.0002,
+      "loss": 0.5823,
+      "step": 2730
+    },
+    {
+      "epoch": 3.7077131258457374,
+      "grad_norm": 1.3318506479263306,
+      "learning_rate": 0.0002,
+      "loss": 0.6039,
+      "step": 2740
+    },
+    {
+      "epoch": 3.7212449255751014,
+      "grad_norm": 1.0107663869857788,
+      "learning_rate": 0.0002,
+      "loss": 0.6488,
+      "step": 2750
+    },
+    {
+      "epoch": 3.7347767253044655,
+      "grad_norm": 1.152219295501709,
+      "learning_rate": 0.0002,
+      "loss": 0.6657,
+      "step": 2760
+    },
+    {
+      "epoch": 3.7483085250338295,
+      "grad_norm": 1.4025444984436035,
+      "learning_rate": 0.0002,
+      "loss": 0.5845,
+      "step": 2770
+    },
+    {
+      "epoch": 3.7618403247631935,
+      "grad_norm": 0.9559378623962402,
+      "learning_rate": 0.0002,
+      "loss": 0.6215,
+      "step": 2780
+    },
+    {
+      "epoch": 3.7753721244925575,
+      "grad_norm": 1.196541428565979,
+      "learning_rate": 0.0002,
+      "loss": 0.6469,
+      "step": 2790
+    },
+    {
+      "epoch": 3.7889039242219216,
+      "grad_norm": 1.0485719442367554,
+      "learning_rate": 0.0002,
+      "loss": 0.5686,
+      "step": 2800
+    },
+    {
+      "epoch": 3.8024357239512856,
+      "grad_norm": 1.3199235200881958,
+      "learning_rate": 0.0002,
+      "loss": 0.557,
+      "step": 2810
+    },
+    {
+      "epoch": 3.8159675236806496,
+      "grad_norm": 1.8519755601882935,
+      "learning_rate": 0.0002,
+      "loss": 0.5797,
+      "step": 2820
+    },
+    {
+      "epoch": 3.8294993234100136,
+      "grad_norm": 1.3234314918518066,
+      "learning_rate": 0.0002,
+      "loss": 0.6194,
+      "step": 2830
+    },
+    {
+      "epoch": 3.8430311231393777,
+      "grad_norm": 1.5337995290756226,
+      "learning_rate": 0.0002,
+      "loss": 0.6192,
+      "step": 2840
+    },
+    {
+      "epoch": 3.8565629228687417,
+      "grad_norm": 1.3527625799179077,
+      "learning_rate": 0.0002,
+      "loss": 0.5753,
+      "step": 2850
+    },
+    {
+      "epoch": 3.8700947225981057,
+      "grad_norm": 1.1479119062423706,
+      "learning_rate": 0.0002,
+      "loss": 0.6199,
+      "step": 2860
+    },
+    {
+      "epoch": 3.8836265223274697,
+      "grad_norm": 1.8172897100448608,
+      "learning_rate": 0.0002,
+      "loss": 0.5928,
+      "step": 2870
+    },
+    {
+      "epoch": 3.8971583220568338,
+      "grad_norm": 2.219006061553955,
+      "learning_rate": 0.0002,
+      "loss": 0.6224,
+      "step": 2880
+    },
+    {
+      "epoch": 3.910690121786198,
+      "grad_norm": 1.1499899625778198,
+      "learning_rate": 0.0002,
+      "loss": 0.6196,
+      "step": 2890
+    },
+    {
+      "epoch": 3.924221921515562,
+      "grad_norm": 1.2255879640579224,
+      "learning_rate": 0.0002,
+      "loss": 0.6052,
+      "step": 2900
+    },
+    {
+      "epoch": 3.937753721244926,
+      "grad_norm": 1.3766648769378662,
+      "learning_rate": 0.0002,
+      "loss": 0.6153,
+      "step": 2910
+    },
+    {
+      "epoch": 3.9512855209742894,
+      "grad_norm": 1.5438952445983887,
+      "learning_rate": 0.0002,
+      "loss": 0.5182,
+      "step": 2920
+    },
+    {
+      "epoch": 3.9648173207036534,
+      "grad_norm": 1.904476523399353,
+      "learning_rate": 0.0002,
+      "loss": 0.5805,
+      "step": 2930
+    },
+    {
+      "epoch": 3.9783491204330175,
+      "grad_norm": 1.5277420282363892,
+      "learning_rate": 0.0002,
+      "loss": 0.6441,
+      "step": 2940
+    },
+    {
+      "epoch": 3.9918809201623815,
+      "grad_norm": 1.4065558910369873,
+      "learning_rate": 0.0002,
+      "loss": 0.6155,
+      "step": 2950
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.4007929563522339,
+      "eval_runtime": 46.5624,
+      "eval_samples_per_second": 9.793,
+      "eval_steps_per_second": 1.224,
+      "step": 2956
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5912,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.296936921464832e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c0e60ee7ec9d35429ff5330b6a72628063da6a9c843e01fa8c503c2ba1303f
+size 5624

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3489c614d6dd211bd1e168d19735e2507dbf45c7792f02ff4815f545144c2a41
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b27ed2d6137700f94216af8a0770a00b46ba018b597e97d5dbbbad00d904a1e
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:413ce3773a0b8e8b94dba01c285394b42fbf629c07c7a96a879547b4da998981
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b21951b8649a073ab5803e4e2fb752c8c608bbfe4280419534497e131334755d
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2656 @@

+{
+  "best_metric": 1.28184974193573,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-10000/checkpoint-1478",
+  "epoch": 5.0,
+  "eval_steps": 10,
+  "global_step": 3695,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013531799729364006,
+      "grad_norm": 1.1050695180892944,
+      "learning_rate": 0.0002,
+      "loss": 1.9104,
+      "step": 10
+    },
+    {
+      "epoch": 0.02706359945872801,
+      "grad_norm": 1.2463239431381226,
+      "learning_rate": 0.0002,
+      "loss": 1.6832,
+      "step": 20
+    },
+    {
+      "epoch": 0.04059539918809202,
+      "grad_norm": 1.193475604057312,
+      "learning_rate": 0.0002,
+      "loss": 1.4273,
+      "step": 30
+    },
+    {
+      "epoch": 0.05412719891745602,
+      "grad_norm": 0.7777265310287476,
+      "learning_rate": 0.0002,
+      "loss": 1.4369,
+      "step": 40
+    },
+    {
+      "epoch": 0.06765899864682003,
+      "grad_norm": 1.4079619646072388,
+      "learning_rate": 0.0002,
+      "loss": 1.3618,
+      "step": 50
+    },
+    {
+      "epoch": 0.08119079837618404,
+      "grad_norm": 0.9451370239257812,
+      "learning_rate": 0.0002,
+      "loss": 1.2639,
+      "step": 60
+    },
+    {
+      "epoch": 0.09472259810554803,
+      "grad_norm": 0.7137989401817322,
+      "learning_rate": 0.0002,
+      "loss": 1.2494,
+      "step": 70
+    },
+    {
+      "epoch": 0.10825439783491204,
+      "grad_norm": 0.5521688461303711,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 80
+    },
+    {
+      "epoch": 0.12178619756427606,
+      "grad_norm": 0.8050252795219421,
+      "learning_rate": 0.0002,
+      "loss": 1.2882,
+      "step": 90
+    },
+    {
+      "epoch": 0.13531799729364005,
+      "grad_norm": 0.6771517992019653,
+      "learning_rate": 0.0002,
+      "loss": 1.2237,
+      "step": 100
+    },
+    {
+      "epoch": 0.14884979702300405,
+      "grad_norm": 0.8056462407112122,
+      "learning_rate": 0.0002,
+      "loss": 1.2265,
+      "step": 110
+    },
+    {
+      "epoch": 0.16238159675236807,
+      "grad_norm": 0.7240928411483765,
+      "learning_rate": 0.0002,
+      "loss": 1.3218,
+      "step": 120
+    },
+    {
+      "epoch": 0.17591339648173207,
+      "grad_norm": 0.5310224294662476,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 130
+    },
+    {
+      "epoch": 0.18944519621109607,
+      "grad_norm": 0.5730571150779724,
+      "learning_rate": 0.0002,
+      "loss": 1.2196,
+      "step": 140
+    },
+    {
+      "epoch": 0.2029769959404601,
+      "grad_norm": 0.5288769006729126,
+      "learning_rate": 0.0002,
+      "loss": 1.3183,
+      "step": 150
+    },
+    {
+      "epoch": 0.2165087956698241,
+      "grad_norm": 0.5447443723678589,
+      "learning_rate": 0.0002,
+      "loss": 1.1886,
+      "step": 160
+    },
+    {
+      "epoch": 0.23004059539918809,
+      "grad_norm": 2.2740917205810547,
+      "learning_rate": 0.0002,
+      "loss": 1.1994,
+      "step": 170
+    },
+    {
+      "epoch": 0.2435723951285521,
+      "grad_norm": 0.6200950741767883,
+      "learning_rate": 0.0002,
+      "loss": 1.2398,
+      "step": 180
+    },
+    {
+      "epoch": 0.2571041948579161,
+      "grad_norm": 1.1782610416412354,
+      "learning_rate": 0.0002,
+      "loss": 1.2827,
+      "step": 190
+    },
+    {
+      "epoch": 0.2706359945872801,
+      "grad_norm": 0.7893068194389343,
+      "learning_rate": 0.0002,
+      "loss": 1.1697,
+      "step": 200
+    },
+    {
+      "epoch": 0.28416779431664413,
+      "grad_norm": 0.5421761274337769,
+      "learning_rate": 0.0002,
+      "loss": 1.221,
+      "step": 210
+    },
+    {
+      "epoch": 0.2976995940460081,
+      "grad_norm": 0.5460169315338135,
+      "learning_rate": 0.0002,
+      "loss": 1.2019,
+      "step": 220
+    },
+    {
+      "epoch": 0.3112313937753721,
+      "grad_norm": 0.5957289934158325,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 230
+    },
+    {
+      "epoch": 0.32476319350473615,
+      "grad_norm": 0.745093047618866,
+      "learning_rate": 0.0002,
+      "loss": 1.2545,
+      "step": 240
+    },
+    {
+      "epoch": 0.3382949932341001,
+      "grad_norm": 0.622627317905426,
+      "learning_rate": 0.0002,
+      "loss": 1.2408,
+      "step": 250
+    },
+    {
+      "epoch": 0.35182679296346414,
+      "grad_norm": 0.5773138999938965,
+      "learning_rate": 0.0002,
+      "loss": 1.3083,
+      "step": 260
+    },
+    {
+      "epoch": 0.36535859269282817,
+      "grad_norm": 1.104275107383728,
+      "learning_rate": 0.0002,
+      "loss": 1.0475,
+      "step": 270
+    },
+    {
+      "epoch": 0.37889039242219213,
+      "grad_norm": 0.5755344033241272,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 280
+    },
+    {
+      "epoch": 0.39242219215155616,
+      "grad_norm": 0.5885311961174011,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 290
+    },
+    {
+      "epoch": 0.4059539918809202,
+      "grad_norm": 1.448182225227356,
+      "learning_rate": 0.0002,
+      "loss": 1.2099,
+      "step": 300
+    },
+    {
+      "epoch": 0.41948579161028415,
+      "grad_norm": 0.5983599424362183,
+      "learning_rate": 0.0002,
+      "loss": 1.2114,
+      "step": 310
+    },
+    {
+      "epoch": 0.4330175913396482,
+      "grad_norm": 0.5013539791107178,
+      "learning_rate": 0.0002,
+      "loss": 1.1691,
+      "step": 320
+    },
+    {
+      "epoch": 0.4465493910690122,
+      "grad_norm": 0.8935738205909729,
+      "learning_rate": 0.0002,
+      "loss": 1.138,
+      "step": 330
+    },
+    {
+      "epoch": 0.46008119079837617,
+      "grad_norm": 0.5642115473747253,
+      "learning_rate": 0.0002,
+      "loss": 1.1336,
+      "step": 340
+    },
+    {
+      "epoch": 0.4736129905277402,
+      "grad_norm": 0.7897255420684814,
+      "learning_rate": 0.0002,
+      "loss": 1.2355,
+      "step": 350
+    },
+    {
+      "epoch": 0.4871447902571042,
+      "grad_norm": 1.6891459226608276,
+      "learning_rate": 0.0002,
+      "loss": 1.1067,
+      "step": 360
+    },
+    {
+      "epoch": 0.5006765899864682,
+      "grad_norm": 1.1374807357788086,
+      "learning_rate": 0.0002,
+      "loss": 1.1787,
+      "step": 370
+    },
+    {
+      "epoch": 0.5142083897158322,
+      "grad_norm": 0.5355549454689026,
+      "learning_rate": 0.0002,
+      "loss": 1.1525,
+      "step": 380
+    },
+    {
+      "epoch": 0.5277401894451962,
+      "grad_norm": 0.656196653842926,
+      "learning_rate": 0.0002,
+      "loss": 1.1298,
+      "step": 390
+    },
+    {
+      "epoch": 0.5412719891745602,
+      "grad_norm": 0.8692356944084167,
+      "learning_rate": 0.0002,
+      "loss": 1.1471,
+      "step": 400
+    },
+    {
+      "epoch": 0.5548037889039242,
+      "grad_norm": 0.5873697400093079,
+      "learning_rate": 0.0002,
+      "loss": 0.9781,
+      "step": 410
+    },
+    {
+      "epoch": 0.5683355886332883,
+      "grad_norm": 0.8922758102416992,
+      "learning_rate": 0.0002,
+      "loss": 1.1841,
+      "step": 420
+    },
+    {
+      "epoch": 0.5818673883626523,
+      "grad_norm": 0.5048012733459473,
+      "learning_rate": 0.0002,
+      "loss": 1.184,
+      "step": 430
+    },
+    {
+      "epoch": 0.5953991880920162,
+      "grad_norm": 0.603631317615509,
+      "learning_rate": 0.0002,
+      "loss": 1.3276,
+      "step": 440
+    },
+    {
+      "epoch": 0.6089309878213802,
+      "grad_norm": 0.5635734796524048,
+      "learning_rate": 0.0002,
+      "loss": 1.0459,
+      "step": 450
+    },
+    {
+      "epoch": 0.6224627875507442,
+      "grad_norm": 0.7483186721801758,
+      "learning_rate": 0.0002,
+      "loss": 1.1549,
+      "step": 460
+    },
+    {
+      "epoch": 0.6359945872801083,
+      "grad_norm": 0.5372456312179565,
+      "learning_rate": 0.0002,
+      "loss": 1.1141,
+      "step": 470
+    },
+    {
+      "epoch": 0.6495263870094723,
+      "grad_norm": 0.9461246728897095,
+      "learning_rate": 0.0002,
+      "loss": 1.2966,
+      "step": 480
+    },
+    {
+      "epoch": 0.6630581867388363,
+      "grad_norm": 0.7017379403114319,
+      "learning_rate": 0.0002,
+      "loss": 1.1226,
+      "step": 490
+    },
+    {
+      "epoch": 0.6765899864682002,
+      "grad_norm": 1.6177887916564941,
+      "learning_rate": 0.0002,
+      "loss": 1.2138,
+      "step": 500
+    },
+    {
+      "epoch": 0.6901217861975643,
+      "grad_norm": 0.4857328236103058,
+      "learning_rate": 0.0002,
+      "loss": 1.0716,
+      "step": 510
+    },
+    {
+      "epoch": 0.7036535859269283,
+      "grad_norm": 1.0545706748962402,
+      "learning_rate": 0.0002,
+      "loss": 1.2898,
+      "step": 520
+    },
+    {
+      "epoch": 0.7171853856562923,
+      "grad_norm": 0.7486541867256165,
+      "learning_rate": 0.0002,
+      "loss": 1.2527,
+      "step": 530
+    },
+    {
+      "epoch": 0.7307171853856563,
+      "grad_norm": 0.6588427424430847,
+      "learning_rate": 0.0002,
+      "loss": 1.0634,
+      "step": 540
+    },
+    {
+      "epoch": 0.7442489851150202,
+      "grad_norm": 0.9485914707183838,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 550
+    },
+    {
+      "epoch": 0.7577807848443843,
+      "grad_norm": 0.7465947866439819,
+      "learning_rate": 0.0002,
+      "loss": 1.1933,
+      "step": 560
+    },
+    {
+      "epoch": 0.7713125845737483,
+      "grad_norm": 0.6392837166786194,
+      "learning_rate": 0.0002,
+      "loss": 1.0927,
+      "step": 570
+    },
+    {
+      "epoch": 0.7848443843031123,
+      "grad_norm": 0.4898282587528229,
+      "learning_rate": 0.0002,
+      "loss": 1.14,
+      "step": 580
+    },
+    {
+      "epoch": 0.7983761840324763,
+      "grad_norm": 0.5636171102523804,
+      "learning_rate": 0.0002,
+      "loss": 1.0425,
+      "step": 590
+    },
+    {
+      "epoch": 0.8119079837618404,
+      "grad_norm": 0.6637675166130066,
+      "learning_rate": 0.0002,
+      "loss": 1.0717,
+      "step": 600
+    },
+    {
+      "epoch": 0.8254397834912043,
+      "grad_norm": 1.1842738389968872,
+      "learning_rate": 0.0002,
+      "loss": 1.1204,
+      "step": 610
+    },
+    {
+      "epoch": 0.8389715832205683,
+      "grad_norm": 0.5699004530906677,
+      "learning_rate": 0.0002,
+      "loss": 1.083,
+      "step": 620
+    },
+    {
+      "epoch": 0.8525033829499323,
+      "grad_norm": 0.7748669385910034,
+      "learning_rate": 0.0002,
+      "loss": 1.153,
+      "step": 630
+    },
+    {
+      "epoch": 0.8660351826792964,
+      "grad_norm": 0.7987180352210999,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 640
+    },
+    {
+      "epoch": 0.8795669824086604,
+      "grad_norm": 1.0740629434585571,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 650
+    },
+    {
+      "epoch": 0.8930987821380244,
+      "grad_norm": 0.731082022190094,
+      "learning_rate": 0.0002,
+      "loss": 1.0503,
+      "step": 660
+    },
+    {
+      "epoch": 0.9066305818673883,
+      "grad_norm": 0.9066846966743469,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 670
+    },
+    {
+      "epoch": 0.9201623815967523,
+      "grad_norm": 0.9934597015380859,
+      "learning_rate": 0.0002,
+      "loss": 1.0516,
+      "step": 680
+    },
+    {
+      "epoch": 0.9336941813261164,
+      "grad_norm": 0.7975896000862122,
+      "learning_rate": 0.0002,
+      "loss": 1.0983,
+      "step": 690
+    },
+    {
+      "epoch": 0.9472259810554804,
+      "grad_norm": 0.9127744436264038,
+      "learning_rate": 0.0002,
+      "loss": 1.2362,
+      "step": 700
+    },
+    {
+      "epoch": 0.9607577807848444,
+      "grad_norm": 0.7682064175605774,
+      "learning_rate": 0.0002,
+      "loss": 1.0273,
+      "step": 710
+    },
+    {
+      "epoch": 0.9742895805142084,
+      "grad_norm": 0.9808696508407593,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 720
+    },
+    {
+      "epoch": 0.9878213802435724,
+      "grad_norm": 1.0826992988586426,
+      "learning_rate": 0.0002,
+      "loss": 1.0979,
+      "step": 730
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2867646217346191,
+      "eval_runtime": 53.6569,
+      "eval_samples_per_second": 8.498,
+      "eval_steps_per_second": 1.062,
+      "step": 739
+    },
+    {
+      "epoch": 1.0013531799729365,
+      "grad_norm": 0.6498591303825378,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 740
+    },
+    {
+      "epoch": 1.0148849797023005,
+      "grad_norm": 0.8040738701820374,
+      "learning_rate": 0.0002,
+      "loss": 1.0737,
+      "step": 750
+    },
+    {
+      "epoch": 1.0284167794316643,
+      "grad_norm": 0.7280412912368774,
+      "learning_rate": 0.0002,
+      "loss": 1.03,
+      "step": 760
+    },
+    {
+      "epoch": 1.0419485791610283,
+      "grad_norm": 0.672149121761322,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 770
+    },
+    {
+      "epoch": 1.0554803788903924,
+      "grad_norm": 1.1186308860778809,
+      "learning_rate": 0.0002,
+      "loss": 0.997,
+      "step": 780
+    },
+    {
+      "epoch": 1.0690121786197564,
+      "grad_norm": 0.9073583483695984,
+      "learning_rate": 0.0002,
+      "loss": 0.9088,
+      "step": 790
+    },
+    {
+      "epoch": 1.0825439783491204,
+      "grad_norm": 0.6135605573654175,
+      "learning_rate": 0.0002,
+      "loss": 0.9413,
+      "step": 800
+    },
+    {
+      "epoch": 1.0960757780784844,
+      "grad_norm": 0.5854787826538086,
+      "learning_rate": 0.0002,
+      "loss": 0.9024,
+      "step": 810
+    },
+    {
+      "epoch": 1.1096075778078485,
+      "grad_norm": 0.9077727794647217,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 820
+    },
+    {
+      "epoch": 1.1231393775372125,
+      "grad_norm": 0.7072564363479614,
+      "learning_rate": 0.0002,
+      "loss": 0.9489,
+      "step": 830
+    },
+    {
+      "epoch": 1.1366711772665765,
+      "grad_norm": 0.9457924365997314,
+      "learning_rate": 0.0002,
+      "loss": 0.9275,
+      "step": 840
+    },
+    {
+      "epoch": 1.1502029769959405,
+      "grad_norm": 0.9216122031211853,
+      "learning_rate": 0.0002,
+      "loss": 0.9998,
+      "step": 850
+    },
+    {
+      "epoch": 1.1637347767253043,
+      "grad_norm": 1.0899791717529297,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 860
+    },
+    {
+      "epoch": 1.1772665764546684,
+      "grad_norm": 0.8594662547111511,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 870
+    },
+    {
+      "epoch": 1.1907983761840324,
+      "grad_norm": 0.8680914640426636,
+      "learning_rate": 0.0002,
+      "loss": 0.9513,
+      "step": 880
+    },
+    {
+      "epoch": 1.2043301759133964,
+      "grad_norm": 0.5579341650009155,
+      "learning_rate": 0.0002,
+      "loss": 0.9695,
+      "step": 890
+    },
+    {
+      "epoch": 1.2178619756427604,
+      "grad_norm": 0.8556986451148987,
+      "learning_rate": 0.0002,
+      "loss": 1.0153,
+      "step": 900
+    },
+    {
+      "epoch": 1.2313937753721245,
+      "grad_norm": 1.8943263292312622,
+      "learning_rate": 0.0002,
+      "loss": 0.9589,
+      "step": 910
+    },
+    {
+      "epoch": 1.2449255751014885,
+      "grad_norm": 0.7652221918106079,
+      "learning_rate": 0.0002,
+      "loss": 0.9554,
+      "step": 920
+    },
+    {
+      "epoch": 1.2584573748308525,
+      "grad_norm": 0.6921482086181641,
+      "learning_rate": 0.0002,
+      "loss": 0.955,
+      "step": 930
+    },
+    {
+      "epoch": 1.2719891745602165,
+      "grad_norm": 0.7211646437644958,
+      "learning_rate": 0.0002,
+      "loss": 1.0335,
+      "step": 940
+    },
+    {
+      "epoch": 1.2855209742895806,
+      "grad_norm": 0.9096421599388123,
+      "learning_rate": 0.0002,
+      "loss": 1.0597,
+      "step": 950
+    },
+    {
+      "epoch": 1.2990527740189446,
+      "grad_norm": 0.743715226650238,
+      "learning_rate": 0.0002,
+      "loss": 1.1143,
+      "step": 960
+    },
+    {
+      "epoch": 1.3125845737483086,
+      "grad_norm": 0.9247064590454102,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 970
+    },
+    {
+      "epoch": 1.3261163734776726,
+      "grad_norm": 1.0811798572540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9534,
+      "step": 980
+    },
+    {
+      "epoch": 1.3396481732070367,
+      "grad_norm": 0.7317015528678894,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 990
+    },
+    {
+      "epoch": 1.3531799729364005,
+      "grad_norm": 0.8399309515953064,
+      "learning_rate": 0.0002,
+      "loss": 1.0071,
+      "step": 1000
+    },
+    {
+      "epoch": 1.3667117726657645,
+      "grad_norm": 1.094558835029602,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 1010
+    },
+    {
+      "epoch": 1.3802435723951285,
+      "grad_norm": 1.3759856224060059,
+      "learning_rate": 0.0002,
+      "loss": 0.8744,
+      "step": 1020
+    },
+    {
+      "epoch": 1.3937753721244925,
+      "grad_norm": 0.8855497241020203,
+      "learning_rate": 0.0002,
+      "loss": 0.915,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4073071718538566,
+      "grad_norm": 3.6836671829223633,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1040
+    },
+    {
+      "epoch": 1.4208389715832206,
+      "grad_norm": 1.1119214296340942,
+      "learning_rate": 0.0002,
+      "loss": 0.8975,
+      "step": 1050
+    },
+    {
+      "epoch": 1.4343707713125846,
+      "grad_norm": 0.8871118426322937,
+      "learning_rate": 0.0002,
+      "loss": 0.9381,
+      "step": 1060
+    },
+    {
+      "epoch": 1.4479025710419486,
+      "grad_norm": 0.9937213063240051,
+      "learning_rate": 0.0002,
+      "loss": 0.9091,
+      "step": 1070
+    },
+    {
+      "epoch": 1.4614343707713127,
+      "grad_norm": 0.7206485867500305,
+      "learning_rate": 0.0002,
+      "loss": 0.9923,
+      "step": 1080
+    },
+    {
+      "epoch": 1.4749661705006765,
+      "grad_norm": 0.8442404866218567,
+      "learning_rate": 0.0002,
+      "loss": 0.951,
+      "step": 1090
+    },
+    {
+      "epoch": 1.4884979702300405,
+      "grad_norm": 0.9265049695968628,
+      "learning_rate": 0.0002,
+      "loss": 0.8609,
+      "step": 1100
+    },
+    {
+      "epoch": 1.5020297699594045,
+      "grad_norm": 1.1033650636672974,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 1110
+    },
+    {
+      "epoch": 1.5155615696887685,
+      "grad_norm": 0.7876176834106445,
+      "learning_rate": 0.0002,
+      "loss": 1.004,
+      "step": 1120
+    },
+    {
+      "epoch": 1.5290933694181326,
+      "grad_norm": 0.7761271595954895,
+      "learning_rate": 0.0002,
+      "loss": 0.9555,
+      "step": 1130
+    },
+    {
+      "epoch": 1.5426251691474966,
+      "grad_norm": 1.0603803396224976,
+      "learning_rate": 0.0002,
+      "loss": 0.9569,
+      "step": 1140
+    },
+    {
+      "epoch": 1.5561569688768606,
+      "grad_norm": 0.7715556621551514,
+      "learning_rate": 0.0002,
+      "loss": 0.9842,
+      "step": 1150
+    },
+    {
+      "epoch": 1.5696887686062246,
+      "grad_norm": 0.6591511368751526,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 1160
+    },
+    {
+      "epoch": 1.5832205683355887,
+      "grad_norm": 1.1773475408554077,
+      "learning_rate": 0.0002,
+      "loss": 0.9584,
+      "step": 1170
+    },
+    {
+      "epoch": 1.5967523680649527,
+      "grad_norm": 0.8513862490653992,
+      "learning_rate": 0.0002,
+      "loss": 0.9229,
+      "step": 1180
+    },
+    {
+      "epoch": 1.6102841677943167,
+      "grad_norm": 1.0796581506729126,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1190
+    },
+    {
+      "epoch": 1.6238159675236807,
+      "grad_norm": 0.8897230625152588,
+      "learning_rate": 0.0002,
+      "loss": 0.9698,
+      "step": 1200
+    },
+    {
+      "epoch": 1.6373477672530448,
+      "grad_norm": 1.4640971422195435,
+      "learning_rate": 0.0002,
+      "loss": 0.9295,
+      "step": 1210
+    },
+    {
+      "epoch": 1.6508795669824088,
+      "grad_norm": 1.123056173324585,
+      "learning_rate": 0.0002,
+      "loss": 1.003,
+      "step": 1220
+    },
+    {
+      "epoch": 1.6644113667117728,
+      "grad_norm": 1.1064175367355347,
+      "learning_rate": 0.0002,
+      "loss": 0.9524,
+      "step": 1230
+    },
+    {
+      "epoch": 1.6779431664411368,
+      "grad_norm": 2.4434642791748047,
+      "learning_rate": 0.0002,
+      "loss": 0.8896,
+      "step": 1240
+    },
+    {
+      "epoch": 1.6914749661705006,
+      "grad_norm": 1.0455760955810547,
+      "learning_rate": 0.0002,
+      "loss": 0.9899,
+      "step": 1250
+    },
+    {
+      "epoch": 1.7050067658998647,
+      "grad_norm": 1.1007593870162964,
+      "learning_rate": 0.0002,
+      "loss": 0.9032,
+      "step": 1260
+    },
+    {
+      "epoch": 1.7185385656292287,
+      "grad_norm": 1.2697606086730957,
+      "learning_rate": 0.0002,
+      "loss": 0.9226,
+      "step": 1270
+    },
+    {
+      "epoch": 1.7320703653585927,
+      "grad_norm": 1.1537855863571167,
+      "learning_rate": 0.0002,
+      "loss": 0.8771,
+      "step": 1280
+    },
+    {
+      "epoch": 1.7456021650879567,
+      "grad_norm": 0.9637187719345093,
+      "learning_rate": 0.0002,
+      "loss": 0.8655,
+      "step": 1290
+    },
+    {
+      "epoch": 1.7591339648173205,
+      "grad_norm": 1.1610347032546997,
+      "learning_rate": 0.0002,
+      "loss": 0.9641,
+      "step": 1300
+    },
+    {
+      "epoch": 1.7726657645466846,
+      "grad_norm": 0.717607319355011,
+      "learning_rate": 0.0002,
+      "loss": 0.9417,
+      "step": 1310
+    },
+    {
+      "epoch": 1.7861975642760486,
+      "grad_norm": 1.753371238708496,
+      "learning_rate": 0.0002,
+      "loss": 0.8852,
+      "step": 1320
+    },
+    {
+      "epoch": 1.7997293640054126,
+      "grad_norm": 0.7919637560844421,
+      "learning_rate": 0.0002,
+      "loss": 1.0327,
+      "step": 1330
+    },
+    {
+      "epoch": 1.8132611637347766,
+      "grad_norm": 1.1091023683547974,
+      "learning_rate": 0.0002,
+      "loss": 1.0019,
+      "step": 1340
+    },
+    {
+      "epoch": 1.8267929634641407,
+      "grad_norm": 0.7157362699508667,
+      "learning_rate": 0.0002,
+      "loss": 0.9457,
+      "step": 1350
+    },
+    {
+      "epoch": 1.8403247631935047,
+      "grad_norm": 0.9538856744766235,
+      "learning_rate": 0.0002,
+      "loss": 0.9818,
+      "step": 1360
+    },
+    {
+      "epoch": 1.8538565629228687,
+      "grad_norm": 1.689642071723938,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 1370
+    },
+    {
+      "epoch": 1.8673883626522327,
+      "grad_norm": 1.3405762910842896,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 1380
+    },
+    {
+      "epoch": 1.8809201623815968,
+      "grad_norm": 1.187905192375183,
+      "learning_rate": 0.0002,
+      "loss": 0.9279,
+      "step": 1390
+    },
+    {
+      "epoch": 1.8944519621109608,
+      "grad_norm": 1.403511643409729,
+      "learning_rate": 0.0002,
+      "loss": 0.9266,
+      "step": 1400
+    },
+    {
+      "epoch": 1.9079837618403248,
+      "grad_norm": 1.4245457649230957,
+      "learning_rate": 0.0002,
+      "loss": 0.9654,
+      "step": 1410
+    },
+    {
+      "epoch": 1.9215155615696888,
+      "grad_norm": 0.6742255687713623,
+      "learning_rate": 0.0002,
+      "loss": 0.9047,
+      "step": 1420
+    },
+    {
+      "epoch": 1.9350473612990529,
+      "grad_norm": 0.9301473498344421,
+      "learning_rate": 0.0002,
+      "loss": 0.9837,
+      "step": 1430
+    },
+    {
+      "epoch": 1.9485791610284169,
+      "grad_norm": 0.8039385080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.9579,
+      "step": 1440
+    },
+    {
+      "epoch": 1.962110960757781,
+      "grad_norm": 0.7449126839637756,
+      "learning_rate": 0.0002,
+      "loss": 0.9433,
+      "step": 1450
+    },
+    {
+      "epoch": 1.975642760487145,
+      "grad_norm": 1.8016695976257324,
+      "learning_rate": 0.0002,
+      "loss": 0.9424,
+      "step": 1460
+    },
+    {
+      "epoch": 1.989174560216509,
+      "grad_norm": 1.3347259759902954,
+      "learning_rate": 0.0002,
+      "loss": 0.9434,
+      "step": 1470
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.28184974193573,
+      "eval_runtime": 54.9872,
+      "eval_samples_per_second": 8.293,
+      "eval_steps_per_second": 1.037,
+      "step": 1478
+    },
+    {
+      "epoch": 2.002706359945873,
+      "grad_norm": 1.1238535642623901,
+      "learning_rate": 0.0002,
+      "loss": 0.8667,
+      "step": 1480
+    },
+    {
+      "epoch": 2.016238159675237,
+      "grad_norm": 0.9186404943466187,
+      "learning_rate": 0.0002,
+      "loss": 0.741,
+      "step": 1490
+    },
+    {
+      "epoch": 2.029769959404601,
+      "grad_norm": 1.42500901222229,
+      "learning_rate": 0.0002,
+      "loss": 0.7402,
+      "step": 1500
+    },
+    {
+      "epoch": 2.0433017591339646,
+      "grad_norm": 0.9018896222114563,
+      "learning_rate": 0.0002,
+      "loss": 0.6594,
+      "step": 1510
+    },
+    {
+      "epoch": 2.0568335588633286,
+      "grad_norm": 0.9482589364051819,
+      "learning_rate": 0.0002,
+      "loss": 0.7973,
+      "step": 1520
+    },
+    {
+      "epoch": 2.0703653585926927,
+      "grad_norm": 1.7364821434020996,
+      "learning_rate": 0.0002,
+      "loss": 0.7494,
+      "step": 1530
+    },
+    {
+      "epoch": 2.0838971583220567,
+      "grad_norm": 1.1600096225738525,
+      "learning_rate": 0.0002,
+      "loss": 0.6691,
+      "step": 1540
+    },
+    {
+      "epoch": 2.0974289580514207,
+      "grad_norm": 1.1180989742279053,
+      "learning_rate": 0.0002,
+      "loss": 0.7443,
+      "step": 1550
+    },
+    {
+      "epoch": 2.1109607577807847,
+      "grad_norm": 0.7978046536445618,
+      "learning_rate": 0.0002,
+      "loss": 0.7261,
+      "step": 1560
+    },
+    {
+      "epoch": 2.1244925575101488,
+      "grad_norm": 0.9089515805244446,
+      "learning_rate": 0.0002,
+      "loss": 0.7659,
+      "step": 1570
+    },
+    {
+      "epoch": 2.138024357239513,
+      "grad_norm": 1.2802879810333252,
+      "learning_rate": 0.0002,
+      "loss": 0.7751,
+      "step": 1580
+    },
+    {
+      "epoch": 2.151556156968877,
+      "grad_norm": 1.1321839094161987,
+      "learning_rate": 0.0002,
+      "loss": 0.7744,
+      "step": 1590
+    },
+    {
+      "epoch": 2.165087956698241,
+      "grad_norm": 0.9985150098800659,
+      "learning_rate": 0.0002,
+      "loss": 0.699,
+      "step": 1600
+    },
+    {
+      "epoch": 2.178619756427605,
+      "grad_norm": 1.1864978075027466,
+      "learning_rate": 0.0002,
+      "loss": 0.7497,
+      "step": 1610
+    },
+    {
+      "epoch": 2.192151556156969,
+      "grad_norm": 1.0220770835876465,
+      "learning_rate": 0.0002,
+      "loss": 0.7648,
+      "step": 1620
+    },
+    {
+      "epoch": 2.205683355886333,
+      "grad_norm": 1.075281023979187,
+      "learning_rate": 0.0002,
+      "loss": 0.7786,
+      "step": 1630
+    },
+    {
+      "epoch": 2.219215155615697,
+      "grad_norm": 1.7539390325546265,
+      "learning_rate": 0.0002,
+      "loss": 0.7169,
+      "step": 1640
+    },
+    {
+      "epoch": 2.232746955345061,
+      "grad_norm": 0.9327954053878784,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 1650
+    },
+    {
+      "epoch": 2.246278755074425,
+      "grad_norm": 1.1239676475524902,
+      "learning_rate": 0.0002,
+      "loss": 0.7096,
+      "step": 1660
+    },
+    {
+      "epoch": 2.259810554803789,
+      "grad_norm": 0.887867271900177,
+      "learning_rate": 0.0002,
+      "loss": 0.7516,
+      "step": 1670
+    },
+    {
+      "epoch": 2.273342354533153,
+      "grad_norm": 0.9934070110321045,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 1680
+    },
+    {
+      "epoch": 2.286874154262517,
+      "grad_norm": 1.1046375036239624,
+      "learning_rate": 0.0002,
+      "loss": 0.7891,
+      "step": 1690
+    },
+    {
+      "epoch": 2.300405953991881,
+      "grad_norm": 1.3520793914794922,
+      "learning_rate": 0.0002,
+      "loss": 0.7123,
+      "step": 1700
+    },
+    {
+      "epoch": 2.313937753721245,
+      "grad_norm": 1.0396424531936646,
+      "learning_rate": 0.0002,
+      "loss": 0.722,
+      "step": 1710
+    },
+    {
+      "epoch": 2.3274695534506087,
+      "grad_norm": 1.312713861465454,
+      "learning_rate": 0.0002,
+      "loss": 0.7645,
+      "step": 1720
+    },
+    {
+      "epoch": 2.3410013531799727,
+      "grad_norm": 1.2425963878631592,
+      "learning_rate": 0.0002,
+      "loss": 0.7743,
+      "step": 1730
+    },
+    {
+      "epoch": 2.3545331529093367,
+      "grad_norm": 1.0335496664047241,
+      "learning_rate": 0.0002,
+      "loss": 0.7069,
+      "step": 1740
+    },
+    {
+      "epoch": 2.3680649526387008,
+      "grad_norm": 0.8289833664894104,
+      "learning_rate": 0.0002,
+      "loss": 0.763,
+      "step": 1750
+    },
+    {
+      "epoch": 2.381596752368065,
+      "grad_norm": 1.1725471019744873,
+      "learning_rate": 0.0002,
+      "loss": 0.749,
+      "step": 1760
+    },
+    {
+      "epoch": 2.395128552097429,
+      "grad_norm": 1.10824716091156,
+      "learning_rate": 0.0002,
+      "loss": 0.7842,
+      "step": 1770
+    },
+    {
+      "epoch": 2.408660351826793,
+      "grad_norm": 1.027957797050476,
+      "learning_rate": 0.0002,
+      "loss": 0.8115,
+      "step": 1780
+    },
+    {
+      "epoch": 2.422192151556157,
+      "grad_norm": 1.4744906425476074,
+      "learning_rate": 0.0002,
+      "loss": 0.7529,
+      "step": 1790
+    },
+    {
+      "epoch": 2.435723951285521,
+      "grad_norm": 2.044746160507202,
+      "learning_rate": 0.0002,
+      "loss": 0.7146,
+      "step": 1800
+    },
+    {
+      "epoch": 2.449255751014885,
+      "grad_norm": 0.9940636157989502,
+      "learning_rate": 0.0002,
+      "loss": 0.6738,
+      "step": 1810
+    },
+    {
+      "epoch": 2.462787550744249,
+      "grad_norm": 1.2338303327560425,
+      "learning_rate": 0.0002,
+      "loss": 0.8233,
+      "step": 1820
+    },
+    {
+      "epoch": 2.476319350473613,
+      "grad_norm": 1.1820061206817627,
+      "learning_rate": 0.0002,
+      "loss": 0.6409,
+      "step": 1830
+    },
+    {
+      "epoch": 2.489851150202977,
+      "grad_norm": 1.5557365417480469,
+      "learning_rate": 0.0002,
+      "loss": 0.7744,
+      "step": 1840
+    },
+    {
+      "epoch": 2.503382949932341,
+      "grad_norm": 0.927599310874939,
+      "learning_rate": 0.0002,
+      "loss": 0.742,
+      "step": 1850
+    },
+    {
+      "epoch": 2.516914749661705,
+      "grad_norm": 1.384813666343689,
+      "learning_rate": 0.0002,
+      "loss": 0.7683,
+      "step": 1860
+    },
+    {
+      "epoch": 2.530446549391069,
+      "grad_norm": 1.0022202730178833,
+      "learning_rate": 0.0002,
+      "loss": 0.7841,
+      "step": 1870
+    },
+    {
+      "epoch": 2.543978349120433,
+      "grad_norm": 1.0271503925323486,
+      "learning_rate": 0.0002,
+      "loss": 0.7422,
+      "step": 1880
+    },
+    {
+      "epoch": 2.557510148849797,
+      "grad_norm": 1.2724764347076416,
+      "learning_rate": 0.0002,
+      "loss": 0.7513,
+      "step": 1890
+    },
+    {
+      "epoch": 2.571041948579161,
+      "grad_norm": 0.9063859581947327,
+      "learning_rate": 0.0002,
+      "loss": 0.7594,
+      "step": 1900
+    },
+    {
+      "epoch": 2.584573748308525,
+      "grad_norm": 0.9433910250663757,
+      "learning_rate": 0.0002,
+      "loss": 0.7423,
+      "step": 1910
+    },
+    {
+      "epoch": 2.598105548037889,
+      "grad_norm": 0.8303482532501221,
+      "learning_rate": 0.0002,
+      "loss": 0.6833,
+      "step": 1920
+    },
+    {
+      "epoch": 2.611637347767253,
+      "grad_norm": 1.16862952709198,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 1930
+    },
+    {
+      "epoch": 2.6251691474966172,
+      "grad_norm": 0.8904703855514526,
+      "learning_rate": 0.0002,
+      "loss": 0.7544,
+      "step": 1940
+    },
+    {
+      "epoch": 2.6387009472259813,
+      "grad_norm": 1.2958505153656006,
+      "learning_rate": 0.0002,
+      "loss": 0.7339,
+      "step": 1950
+    },
+    {
+      "epoch": 2.6522327469553453,
+      "grad_norm": 1.2310389280319214,
+      "learning_rate": 0.0002,
+      "loss": 0.7502,
+      "step": 1960
+    },
+    {
+      "epoch": 2.6657645466847093,
+      "grad_norm": 1.3157947063446045,
+      "learning_rate": 0.0002,
+      "loss": 0.8305,
+      "step": 1970
+    },
+    {
+      "epoch": 2.6792963464140733,
+      "grad_norm": 0.9247841238975525,
+      "learning_rate": 0.0002,
+      "loss": 0.7348,
+      "step": 1980
+    },
+    {
+      "epoch": 2.6928281461434374,
+      "grad_norm": 0.9850119352340698,
+      "learning_rate": 0.0002,
+      "loss": 0.7352,
+      "step": 1990
+    },
+    {
+      "epoch": 2.706359945872801,
+      "grad_norm": 1.59624183177948,
+      "learning_rate": 0.0002,
+      "loss": 0.7794,
+      "step": 2000
+    },
+    {
+      "epoch": 2.719891745602165,
+      "grad_norm": 1.791932225227356,
+      "learning_rate": 0.0002,
+      "loss": 0.6918,
+      "step": 2010
+    },
+    {
+      "epoch": 2.733423545331529,
+      "grad_norm": 0.8530828356742859,
+      "learning_rate": 0.0002,
+      "loss": 0.7616,
+      "step": 2020
+    },
+    {
+      "epoch": 2.746955345060893,
+      "grad_norm": 1.431843638420105,
+      "learning_rate": 0.0002,
+      "loss": 0.8028,
+      "step": 2030
+    },
+    {
+      "epoch": 2.760487144790257,
+      "grad_norm": 1.1818324327468872,
+      "learning_rate": 0.0002,
+      "loss": 0.7403,
+      "step": 2040
+    },
+    {
+      "epoch": 2.774018944519621,
+      "grad_norm": 1.0456408262252808,
+      "learning_rate": 0.0002,
+      "loss": 0.6752,
+      "step": 2050
+    },
+    {
+      "epoch": 2.787550744248985,
+      "grad_norm": 1.5935403108596802,
+      "learning_rate": 0.0002,
+      "loss": 0.7771,
+      "step": 2060
+    },
+    {
+      "epoch": 2.801082543978349,
+      "grad_norm": 1.6653326749801636,
+      "learning_rate": 0.0002,
+      "loss": 0.722,
+      "step": 2070
+    },
+    {
+      "epoch": 2.814614343707713,
+      "grad_norm": 1.2409698963165283,
+      "learning_rate": 0.0002,
+      "loss": 0.7553,
+      "step": 2080
+    },
+    {
+      "epoch": 2.828146143437077,
+      "grad_norm": 0.8511452674865723,
+      "learning_rate": 0.0002,
+      "loss": 0.7483,
+      "step": 2090
+    },
+    {
+      "epoch": 2.841677943166441,
+      "grad_norm": 1.1064083576202393,
+      "learning_rate": 0.0002,
+      "loss": 0.7773,
+      "step": 2100
+    },
+    {
+      "epoch": 2.855209742895805,
+      "grad_norm": 1.5252450704574585,
+      "learning_rate": 0.0002,
+      "loss": 0.8025,
+      "step": 2110
+    },
+    {
+      "epoch": 2.8687415426251692,
+      "grad_norm": 1.8477630615234375,
+      "learning_rate": 0.0002,
+      "loss": 0.7342,
+      "step": 2120
+    },
+    {
+      "epoch": 2.8822733423545333,
+      "grad_norm": 1.8251630067825317,
+      "learning_rate": 0.0002,
+      "loss": 1.0005,
+      "step": 2130
+    },
+    {
+      "epoch": 2.8958051420838973,
+      "grad_norm": 2.0696771144866943,
+      "learning_rate": 0.0002,
+      "loss": 0.8036,
+      "step": 2140
+    },
+    {
+      "epoch": 2.9093369418132613,
+      "grad_norm": 1.0875508785247803,
+      "learning_rate": 0.0002,
+      "loss": 0.7561,
+      "step": 2150
+    },
+    {
+      "epoch": 2.9228687415426253,
+      "grad_norm": 1.1486080884933472,
+      "learning_rate": 0.0002,
+      "loss": 0.808,
+      "step": 2160
+    },
+    {
+      "epoch": 2.936400541271989,
+      "grad_norm": 1.3913694620132446,
+      "learning_rate": 0.0002,
+      "loss": 0.7418,
+      "step": 2170
+    },
+    {
+      "epoch": 2.949932341001353,
+      "grad_norm": 1.0237643718719482,
+      "learning_rate": 0.0002,
+      "loss": 0.7829,
+      "step": 2180
+    },
+    {
+      "epoch": 2.963464140730717,
+      "grad_norm": 1.0060926675796509,
+      "learning_rate": 0.0002,
+      "loss": 0.8717,
+      "step": 2190
+    },
+    {
+      "epoch": 2.976995940460081,
+      "grad_norm": 0.8395462036132812,
+      "learning_rate": 0.0002,
+      "loss": 0.7581,
+      "step": 2200
+    },
+    {
+      "epoch": 2.990527740189445,
+      "grad_norm": 1.4483158588409424,
+      "learning_rate": 0.0002,
+      "loss": 0.7032,
+      "step": 2210
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.3191189765930176,
+      "eval_runtime": 53.5323,
+      "eval_samples_per_second": 8.518,
+      "eval_steps_per_second": 1.065,
+      "step": 2217
+    },
+    {
+      "epoch": 3.004059539918809,
+      "grad_norm": 0.9717937111854553,
+      "learning_rate": 0.0002,
+      "loss": 0.6775,
+      "step": 2220
+    },
+    {
+      "epoch": 3.017591339648173,
+      "grad_norm": 1.2057876586914062,
+      "learning_rate": 0.0002,
+      "loss": 0.6059,
+      "step": 2230
+    },
+    {
+      "epoch": 3.031123139377537,
+      "grad_norm": 1.2295159101486206,
+      "learning_rate": 0.0002,
+      "loss": 0.5764,
+      "step": 2240
+    },
+    {
+      "epoch": 3.044654939106901,
+      "grad_norm": 1.1200335025787354,
+      "learning_rate": 0.0002,
+      "loss": 0.5088,
+      "step": 2250
+    },
+    {
+      "epoch": 3.058186738836265,
+      "grad_norm": 1.3113594055175781,
+      "learning_rate": 0.0002,
+      "loss": 0.5464,
+      "step": 2260
+    },
+    {
+      "epoch": 3.071718538565629,
+      "grad_norm": 1.3074201345443726,
+      "learning_rate": 0.0002,
+      "loss": 0.5959,
+      "step": 2270
+    },
+    {
+      "epoch": 3.085250338294993,
+      "grad_norm": 1.7636418342590332,
+      "learning_rate": 0.0002,
+      "loss": 0.642,
+      "step": 2280
+    },
+    {
+      "epoch": 3.098782138024357,
+      "grad_norm": 1.2225017547607422,
+      "learning_rate": 0.0002,
+      "loss": 0.5645,
+      "step": 2290
+    },
+    {
+      "epoch": 3.1123139377537212,
+      "grad_norm": 1.062538743019104,
+      "learning_rate": 0.0002,
+      "loss": 0.5587,
+      "step": 2300
+    },
+    {
+      "epoch": 3.1258457374830853,
+      "grad_norm": 1.9475018978118896,
+      "learning_rate": 0.0002,
+      "loss": 0.5426,
+      "step": 2310
+    },
+    {
+      "epoch": 3.1393775372124493,
+      "grad_norm": 1.3695366382598877,
+      "learning_rate": 0.0002,
+      "loss": 0.5028,
+      "step": 2320
+    },
+    {
+      "epoch": 3.1529093369418133,
+      "grad_norm": 1.4610179662704468,
+      "learning_rate": 0.0002,
+      "loss": 0.6278,
+      "step": 2330
+    },
+    {
+      "epoch": 3.1664411366711773,
+      "grad_norm": 1.1319258213043213,
+      "learning_rate": 0.0002,
+      "loss": 0.6062,
+      "step": 2340
+    },
+    {
+      "epoch": 3.1799729364005414,
+      "grad_norm": 1.8418315649032593,
+      "learning_rate": 0.0002,
+      "loss": 0.5946,
+      "step": 2350
+    },
+    {
+      "epoch": 3.1935047361299054,
+      "grad_norm": 1.0682015419006348,
+      "learning_rate": 0.0002,
+      "loss": 0.6215,
+      "step": 2360
+    },
+    {
+      "epoch": 3.2070365358592694,
+      "grad_norm": 0.9852792024612427,
+      "learning_rate": 0.0002,
+      "loss": 0.5431,
+      "step": 2370
+    },
+    {
+      "epoch": 3.2205683355886334,
+      "grad_norm": 1.447991967201233,
+      "learning_rate": 0.0002,
+      "loss": 0.561,
+      "step": 2380
+    },
+    {
+      "epoch": 3.2341001353179974,
+      "grad_norm": 1.3113367557525635,
+      "learning_rate": 0.0002,
+      "loss": 0.6143,
+      "step": 2390
+    },
+    {
+      "epoch": 3.2476319350473615,
+      "grad_norm": 1.412656307220459,
+      "learning_rate": 0.0002,
+      "loss": 0.6268,
+      "step": 2400
+    },
+    {
+      "epoch": 3.2611637347767255,
+      "grad_norm": 1.41526198387146,
+      "learning_rate": 0.0002,
+      "loss": 0.5883,
+      "step": 2410
+    },
+    {
+      "epoch": 3.2746955345060895,
+      "grad_norm": 1.5622785091400146,
+      "learning_rate": 0.0002,
+      "loss": 0.5242,
+      "step": 2420
+    },
+    {
+      "epoch": 3.2882273342354535,
+      "grad_norm": 1.6155788898468018,
+      "learning_rate": 0.0002,
+      "loss": 0.5536,
+      "step": 2430
+    },
+    {
+      "epoch": 3.301759133964817,
+      "grad_norm": 1.4699913263320923,
+      "learning_rate": 0.0002,
+      "loss": 0.5464,
+      "step": 2440
+    },
+    {
+      "epoch": 3.315290933694181,
+      "grad_norm": 1.0095789432525635,
+      "learning_rate": 0.0002,
+      "loss": 0.6074,
+      "step": 2450
+    },
+    {
+      "epoch": 3.328822733423545,
+      "grad_norm": 1.620950698852539,
+      "learning_rate": 0.0002,
+      "loss": 0.5316,
+      "step": 2460
+    },
+    {
+      "epoch": 3.342354533152909,
+      "grad_norm": 1.4491326808929443,
+      "learning_rate": 0.0002,
+      "loss": 0.6617,
+      "step": 2470
+    },
+    {
+      "epoch": 3.3558863328822732,
+      "grad_norm": 1.9128118753433228,
+      "learning_rate": 0.0002,
+      "loss": 0.5639,
+      "step": 2480
+    },
+    {
+      "epoch": 3.3694181326116373,
+      "grad_norm": 1.36688232421875,
+      "learning_rate": 0.0002,
+      "loss": 0.5958,
+      "step": 2490
+    },
+    {
+      "epoch": 3.3829499323410013,
+      "grad_norm": 1.455443263053894,
+      "learning_rate": 0.0002,
+      "loss": 0.5432,
+      "step": 2500
+    },
+    {
+      "epoch": 3.3964817320703653,
+      "grad_norm": 1.2894777059555054,
+      "learning_rate": 0.0002,
+      "loss": 0.6697,
+      "step": 2510
+    },
+    {
+      "epoch": 3.4100135317997293,
+      "grad_norm": 1.3889403343200684,
+      "learning_rate": 0.0002,
+      "loss": 0.5893,
+      "step": 2520
+    },
+    {
+      "epoch": 3.4235453315290933,
+      "grad_norm": 1.4315358400344849,
+      "learning_rate": 0.0002,
+      "loss": 0.5148,
+      "step": 2530
+    },
+    {
+      "epoch": 3.4370771312584574,
+      "grad_norm": 1.3308886289596558,
+      "learning_rate": 0.0002,
+      "loss": 0.5766,
+      "step": 2540
+    },
+    {
+      "epoch": 3.4506089309878214,
+      "grad_norm": 1.2735179662704468,
+      "learning_rate": 0.0002,
+      "loss": 0.6168,
+      "step": 2550
+    },
+    {
+      "epoch": 3.4641407307171854,
+      "grad_norm": 1.2731887102127075,
+      "learning_rate": 0.0002,
+      "loss": 0.6133,
+      "step": 2560
+    },
+    {
+      "epoch": 3.4776725304465494,
+      "grad_norm": 2.390596628189087,
+      "learning_rate": 0.0002,
+      "loss": 0.5956,
+      "step": 2570
+    },
+    {
+      "epoch": 3.4912043301759135,
+      "grad_norm": 1.3651424646377563,
+      "learning_rate": 0.0002,
+      "loss": 0.576,
+      "step": 2580
+    },
+    {
+      "epoch": 3.5047361299052775,
+      "grad_norm": 0.9903562068939209,
+      "learning_rate": 0.0002,
+      "loss": 0.5456,
+      "step": 2590
+    },
+    {
+      "epoch": 3.5182679296346415,
+      "grad_norm": 1.467106580734253,
+      "learning_rate": 0.0002,
+      "loss": 0.6323,
+      "step": 2600
+    },
+    {
+      "epoch": 3.5317997293640055,
+      "grad_norm": 1.4800456762313843,
+      "learning_rate": 0.0002,
+      "loss": 0.6195,
+      "step": 2610
+    },
+    {
+      "epoch": 3.5453315290933696,
+      "grad_norm": 1.140714406967163,
+      "learning_rate": 0.0002,
+      "loss": 0.5971,
+      "step": 2620
+    },
+    {
+      "epoch": 3.558863328822733,
+      "grad_norm": 2.1062142848968506,
+      "learning_rate": 0.0002,
+      "loss": 0.6463,
+      "step": 2630
+    },
+    {
+      "epoch": 3.572395128552097,
+      "grad_norm": 1.3074438571929932,
+      "learning_rate": 0.0002,
+      "loss": 0.5909,
+      "step": 2640
+    },
+    {
+      "epoch": 3.585926928281461,
+      "grad_norm": 1.80443274974823,
+      "learning_rate": 0.0002,
+      "loss": 0.6123,
+      "step": 2650
+    },
+    {
+      "epoch": 3.5994587280108252,
+      "grad_norm": 1.0620969533920288,
+      "learning_rate": 0.0002,
+      "loss": 0.6347,
+      "step": 2660
+    },
+    {
+      "epoch": 3.6129905277401893,
+      "grad_norm": 1.3793504238128662,
+      "learning_rate": 0.0002,
+      "loss": 0.6084,
+      "step": 2670
+    },
+    {
+      "epoch": 3.6265223274695533,
+      "grad_norm": 1.0759015083312988,
+      "learning_rate": 0.0002,
+      "loss": 0.615,
+      "step": 2680
+    },
+    {
+      "epoch": 3.6400541271989173,
+      "grad_norm": 1.5374208688735962,
+      "learning_rate": 0.0002,
+      "loss": 0.6319,
+      "step": 2690
+    },
+    {
+      "epoch": 3.6535859269282813,
+      "grad_norm": 1.690587043762207,
+      "learning_rate": 0.0002,
+      "loss": 0.5328,
+      "step": 2700
+    },
+    {
+      "epoch": 3.6671177266576453,
+      "grad_norm": 1.2092949151992798,
+      "learning_rate": 0.0002,
+      "loss": 0.5715,
+      "step": 2710
+    },
+    {
+      "epoch": 3.6806495263870094,
+      "grad_norm": 1.8789589405059814,
+      "learning_rate": 0.0002,
+      "loss": 0.6063,
+      "step": 2720
+    },
+    {
+      "epoch": 3.6941813261163734,
+      "grad_norm": 1.5840286016464233,
+      "learning_rate": 0.0002,
+      "loss": 0.5823,
+      "step": 2730
+    },
+    {
+      "epoch": 3.7077131258457374,
+      "grad_norm": 1.3318506479263306,
+      "learning_rate": 0.0002,
+      "loss": 0.6039,
+      "step": 2740
+    },
+    {
+      "epoch": 3.7212449255751014,
+      "grad_norm": 1.0107663869857788,
+      "learning_rate": 0.0002,
+      "loss": 0.6488,
+      "step": 2750
+    },
+    {
+      "epoch": 3.7347767253044655,
+      "grad_norm": 1.152219295501709,
+      "learning_rate": 0.0002,
+      "loss": 0.6657,
+      "step": 2760
+    },
+    {
+      "epoch": 3.7483085250338295,
+      "grad_norm": 1.4025444984436035,
+      "learning_rate": 0.0002,
+      "loss": 0.5845,
+      "step": 2770
+    },
+    {
+      "epoch": 3.7618403247631935,
+      "grad_norm": 0.9559378623962402,
+      "learning_rate": 0.0002,
+      "loss": 0.6215,
+      "step": 2780
+    },
+    {
+      "epoch": 3.7753721244925575,
+      "grad_norm": 1.196541428565979,
+      "learning_rate": 0.0002,
+      "loss": 0.6469,
+      "step": 2790
+    },
+    {
+      "epoch": 3.7889039242219216,
+      "grad_norm": 1.0485719442367554,
+      "learning_rate": 0.0002,
+      "loss": 0.5686,
+      "step": 2800
+    },
+    {
+      "epoch": 3.8024357239512856,
+      "grad_norm": 1.3199235200881958,
+      "learning_rate": 0.0002,
+      "loss": 0.557,
+      "step": 2810
+    },
+    {
+      "epoch": 3.8159675236806496,
+      "grad_norm": 1.8519755601882935,
+      "learning_rate": 0.0002,
+      "loss": 0.5797,
+      "step": 2820
+    },
+    {
+      "epoch": 3.8294993234100136,
+      "grad_norm": 1.3234314918518066,
+      "learning_rate": 0.0002,
+      "loss": 0.6194,
+      "step": 2830
+    },
+    {
+      "epoch": 3.8430311231393777,
+      "grad_norm": 1.5337995290756226,
+      "learning_rate": 0.0002,
+      "loss": 0.6192,
+      "step": 2840
+    },
+    {
+      "epoch": 3.8565629228687417,
+      "grad_norm": 1.3527625799179077,
+      "learning_rate": 0.0002,
+      "loss": 0.5753,
+      "step": 2850
+    },
+    {
+      "epoch": 3.8700947225981057,
+      "grad_norm": 1.1479119062423706,
+      "learning_rate": 0.0002,
+      "loss": 0.6199,
+      "step": 2860
+    },
+    {
+      "epoch": 3.8836265223274697,
+      "grad_norm": 1.8172897100448608,
+      "learning_rate": 0.0002,
+      "loss": 0.5928,
+      "step": 2870
+    },
+    {
+      "epoch": 3.8971583220568338,
+      "grad_norm": 2.219006061553955,
+      "learning_rate": 0.0002,
+      "loss": 0.6224,
+      "step": 2880
+    },
+    {
+      "epoch": 3.910690121786198,
+      "grad_norm": 1.1499899625778198,
+      "learning_rate": 0.0002,
+      "loss": 0.6196,
+      "step": 2890
+    },
+    {
+      "epoch": 3.924221921515562,
+      "grad_norm": 1.2255879640579224,
+      "learning_rate": 0.0002,
+      "loss": 0.6052,
+      "step": 2900
+    },
+    {
+      "epoch": 3.937753721244926,
+      "grad_norm": 1.3766648769378662,
+      "learning_rate": 0.0002,
+      "loss": 0.6153,
+      "step": 2910
+    },
+    {
+      "epoch": 3.9512855209742894,
+      "grad_norm": 1.5438952445983887,
+      "learning_rate": 0.0002,
+      "loss": 0.5182,
+      "step": 2920
+    },
+    {
+      "epoch": 3.9648173207036534,
+      "grad_norm": 1.904476523399353,
+      "learning_rate": 0.0002,
+      "loss": 0.5805,
+      "step": 2930
+    },
+    {
+      "epoch": 3.9783491204330175,
+      "grad_norm": 1.5277420282363892,
+      "learning_rate": 0.0002,
+      "loss": 0.6441,
+      "step": 2940
+    },
+    {
+      "epoch": 3.9918809201623815,
+      "grad_norm": 1.4065558910369873,
+      "learning_rate": 0.0002,
+      "loss": 0.6155,
+      "step": 2950
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.4007929563522339,
+      "eval_runtime": 46.5624,
+      "eval_samples_per_second": 9.793,
+      "eval_steps_per_second": 1.224,
+      "step": 2956
+    },
+    {
+      "epoch": 4.005412719891746,
+      "grad_norm": 1.1447370052337646,
+      "learning_rate": 0.0002,
+      "loss": 0.621,
+      "step": 2960
+    },
+    {
+      "epoch": 4.01894451962111,
+      "grad_norm": 2.373656749725342,
+      "learning_rate": 0.0002,
+      "loss": 0.4338,
+      "step": 2970
+    },
+    {
+      "epoch": 4.032476319350474,
+      "grad_norm": 1.6204203367233276,
+      "learning_rate": 0.0002,
+      "loss": 0.4464,
+      "step": 2980
+    },
+    {
+      "epoch": 4.046008119079838,
+      "grad_norm": 1.4535361528396606,
+      "learning_rate": 0.0002,
+      "loss": 0.4758,
+      "step": 2990
+    },
+    {
+      "epoch": 4.059539918809202,
+      "grad_norm": 1.4007105827331543,
+      "learning_rate": 0.0002,
+      "loss": 0.5019,
+      "step": 3000
+    },
+    {
+      "epoch": 4.073071718538565,
+      "grad_norm": 1.429174542427063,
+      "learning_rate": 0.0002,
+      "loss": 0.4113,
+      "step": 3010
+    },
+    {
+      "epoch": 4.086603518267929,
+      "grad_norm": 2.3854527473449707,
+      "learning_rate": 0.0002,
+      "loss": 0.4813,
+      "step": 3020
+    },
+    {
+      "epoch": 4.100135317997293,
+      "grad_norm": 1.4506710767745972,
+      "learning_rate": 0.0002,
+      "loss": 0.4133,
+      "step": 3030
+    },
+    {
+      "epoch": 4.113667117726657,
+      "grad_norm": 1.753843069076538,
+      "learning_rate": 0.0002,
+      "loss": 0.4594,
+      "step": 3040
+    },
+    {
+      "epoch": 4.127198917456021,
+      "grad_norm": 1.2288557291030884,
+      "learning_rate": 0.0002,
+      "loss": 0.4558,
+      "step": 3050
+    },
+    {
+      "epoch": 4.140730717185385,
+      "grad_norm": 1.6144076585769653,
+      "learning_rate": 0.0002,
+      "loss": 0.4575,
+      "step": 3060
+    },
+    {
+      "epoch": 4.154262516914749,
+      "grad_norm": 1.3560758829116821,
+      "learning_rate": 0.0002,
+      "loss": 0.469,
+      "step": 3070
+    },
+    {
+      "epoch": 4.167794316644113,
+      "grad_norm": 2.0955488681793213,
+      "learning_rate": 0.0002,
+      "loss": 0.4419,
+      "step": 3080
+    },
+    {
+      "epoch": 4.181326116373477,
+      "grad_norm": 1.7745214700698853,
+      "learning_rate": 0.0002,
+      "loss": 0.413,
+      "step": 3090
+    },
+    {
+      "epoch": 4.194857916102841,
+      "grad_norm": 1.6405600309371948,
+      "learning_rate": 0.0002,
+      "loss": 0.5026,
+      "step": 3100
+    },
+    {
+      "epoch": 4.2083897158322054,
+      "grad_norm": 2.1980135440826416,
+      "learning_rate": 0.0002,
+      "loss": 0.5206,
+      "step": 3110
+    },
+    {
+      "epoch": 4.2219215155615695,
+      "grad_norm": 1.497697114944458,
+      "learning_rate": 0.0002,
+      "loss": 0.4846,
+      "step": 3120
+    },
+    {
+      "epoch": 4.2354533152909335,
+      "grad_norm": 2.131647825241089,
+      "learning_rate": 0.0002,
+      "loss": 0.4402,
+      "step": 3130
+    },
+    {
+      "epoch": 4.2489851150202975,
+      "grad_norm": 1.530166745185852,
+      "learning_rate": 0.0002,
+      "loss": 0.4419,
+      "step": 3140
+    },
+    {
+      "epoch": 4.2625169147496615,
+      "grad_norm": 1.891534686088562,
+      "learning_rate": 0.0002,
+      "loss": 0.4958,
+      "step": 3150
+    },
+    {
+      "epoch": 4.276048714479026,
+      "grad_norm": 1.6743305921554565,
+      "learning_rate": 0.0002,
+      "loss": 0.4452,
+      "step": 3160
+    },
+    {
+      "epoch": 4.28958051420839,
+      "grad_norm": 1.0215026140213013,
+      "learning_rate": 0.0002,
+      "loss": 0.4832,
+      "step": 3170
+    },
+    {
+      "epoch": 4.303112313937754,
+      "grad_norm": 1.2947571277618408,
+      "learning_rate": 0.0002,
+      "loss": 0.4753,
+      "step": 3180
+    },
+    {
+      "epoch": 4.316644113667118,
+      "grad_norm": 1.3008885383605957,
+      "learning_rate": 0.0002,
+      "loss": 0.4642,
+      "step": 3190
+    },
+    {
+      "epoch": 4.330175913396482,
+      "grad_norm": 1.5144374370574951,
+      "learning_rate": 0.0002,
+      "loss": 0.4942,
+      "step": 3200
+    },
+    {
+      "epoch": 4.343707713125846,
+      "grad_norm": 1.5779962539672852,
+      "learning_rate": 0.0002,
+      "loss": 0.4567,
+      "step": 3210
+    },
+    {
+      "epoch": 4.35723951285521,
+      "grad_norm": 1.264953851699829,
+      "learning_rate": 0.0002,
+      "loss": 0.4748,
+      "step": 3220
+    },
+    {
+      "epoch": 4.370771312584574,
+      "grad_norm": 2.3533711433410645,
+      "learning_rate": 0.0002,
+      "loss": 0.4494,
+      "step": 3230
+    },
+    {
+      "epoch": 4.384303112313938,
+      "grad_norm": 1.2202765941619873,
+      "learning_rate": 0.0002,
+      "loss": 0.5197,
+      "step": 3240
+    },
+    {
+      "epoch": 4.397834912043302,
+      "grad_norm": 1.5369930267333984,
+      "learning_rate": 0.0002,
+      "loss": 0.5446,
+      "step": 3250
+    },
+    {
+      "epoch": 4.411366711772666,
+      "grad_norm": 1.2333588600158691,
+      "learning_rate": 0.0002,
+      "loss": 0.4825,
+      "step": 3260
+    },
+    {
+      "epoch": 4.42489851150203,
+      "grad_norm": 1.7053014039993286,
+      "learning_rate": 0.0002,
+      "loss": 0.486,
+      "step": 3270
+    },
+    {
+      "epoch": 4.438430311231394,
+      "grad_norm": 1.5808049440383911,
+      "learning_rate": 0.0002,
+      "loss": 0.4026,
+      "step": 3280
+    },
+    {
+      "epoch": 4.451962110960758,
+      "grad_norm": 1.5628689527511597,
+      "learning_rate": 0.0002,
+      "loss": 0.4689,
+      "step": 3290
+    },
+    {
+      "epoch": 4.465493910690122,
+      "grad_norm": 1.531670331954956,
+      "learning_rate": 0.0002,
+      "loss": 0.4748,
+      "step": 3300
+    },
+    {
+      "epoch": 4.479025710419486,
+      "grad_norm": 1.3667949438095093,
+      "learning_rate": 0.0002,
+      "loss": 0.4898,
+      "step": 3310
+    },
+    {
+      "epoch": 4.49255751014885,
+      "grad_norm": 1.4732868671417236,
+      "learning_rate": 0.0002,
+      "loss": 0.4669,
+      "step": 3320
+    },
+    {
+      "epoch": 4.506089309878214,
+      "grad_norm": 1.3093361854553223,
+      "learning_rate": 0.0002,
+      "loss": 0.4876,
+      "step": 3330
+    },
+    {
+      "epoch": 4.519621109607578,
+      "grad_norm": 1.349886178970337,
+      "learning_rate": 0.0002,
+      "loss": 0.4783,
+      "step": 3340
+    },
+    {
+      "epoch": 4.533152909336942,
+      "grad_norm": 1.8089256286621094,
+      "learning_rate": 0.0002,
+      "loss": 0.5042,
+      "step": 3350
+    },
+    {
+      "epoch": 4.546684709066306,
+      "grad_norm": 1.96787691116333,
+      "learning_rate": 0.0002,
+      "loss": 0.4917,
+      "step": 3360
+    },
+    {
+      "epoch": 4.56021650879567,
+      "grad_norm": 1.7355002164840698,
+      "learning_rate": 0.0002,
+      "loss": 0.4525,
+      "step": 3370
+    },
+    {
+      "epoch": 4.573748308525034,
+      "grad_norm": 2.0564498901367188,
+      "learning_rate": 0.0002,
+      "loss": 0.4611,
+      "step": 3380
+    },
+    {
+      "epoch": 4.587280108254398,
+      "grad_norm": 1.9259542226791382,
+      "learning_rate": 0.0002,
+      "loss": 0.4858,
+      "step": 3390
+    },
+    {
+      "epoch": 4.600811907983762,
+      "grad_norm": 1.7264310121536255,
+      "learning_rate": 0.0002,
+      "loss": 0.4652,
+      "step": 3400
+    },
+    {
+      "epoch": 4.614343707713126,
+      "grad_norm": 1.2677029371261597,
+      "learning_rate": 0.0002,
+      "loss": 0.4905,
+      "step": 3410
+    },
+    {
+      "epoch": 4.62787550744249,
+      "grad_norm": 1.4264763593673706,
+      "learning_rate": 0.0002,
+      "loss": 0.4806,
+      "step": 3420
+    },
+    {
+      "epoch": 4.641407307171854,
+      "grad_norm": 1.6065561771392822,
+      "learning_rate": 0.0002,
+      "loss": 0.4323,
+      "step": 3430
+    },
+    {
+      "epoch": 4.654939106901217,
+      "grad_norm": 1.4936598539352417,
+      "learning_rate": 0.0002,
+      "loss": 0.4915,
+      "step": 3440
+    },
+    {
+      "epoch": 4.668470906630581,
+      "grad_norm": 1.4462144374847412,
+      "learning_rate": 0.0002,
+      "loss": 0.5243,
+      "step": 3450
+    },
+    {
+      "epoch": 4.682002706359945,
+      "grad_norm": 1.3262397050857544,
+      "learning_rate": 0.0002,
+      "loss": 0.5087,
+      "step": 3460
+    },
+    {
+      "epoch": 4.6955345060893094,
+      "grad_norm": 1.5079004764556885,
+      "learning_rate": 0.0002,
+      "loss": 0.5273,
+      "step": 3470
+    },
+    {
+      "epoch": 4.7090663058186735,
+      "grad_norm": 1.8043315410614014,
+      "learning_rate": 0.0002,
+      "loss": 0.5132,
+      "step": 3480
+    },
+    {
+      "epoch": 4.7225981055480375,
+      "grad_norm": 1.2165871858596802,
+      "learning_rate": 0.0002,
+      "loss": 0.4798,
+      "step": 3490
+    },
+    {
+      "epoch": 4.7361299052774015,
+      "grad_norm": 1.6533914804458618,
+      "learning_rate": 0.0002,
+      "loss": 0.4715,
+      "step": 3500
+    },
+    {
+      "epoch": 4.7496617050067655,
+      "grad_norm": 1.299289345741272,
+      "learning_rate": 0.0002,
+      "loss": 0.5074,
+      "step": 3510
+    },
+    {
+      "epoch": 4.76319350473613,
+      "grad_norm": 1.6199723482131958,
+      "learning_rate": 0.0002,
+      "loss": 0.4694,
+      "step": 3520
+    },
+    {
+      "epoch": 4.776725304465494,
+      "grad_norm": 1.6506156921386719,
+      "learning_rate": 0.0002,
+      "loss": 0.4804,
+      "step": 3530
+    },
+    {
+      "epoch": 4.790257104194858,
+      "grad_norm": 1.6849974393844604,
+      "learning_rate": 0.0002,
+      "loss": 0.4814,
+      "step": 3540
+    },
+    {
+      "epoch": 4.803788903924222,
+      "grad_norm": 1.4276493787765503,
+      "learning_rate": 0.0002,
+      "loss": 0.4856,
+      "step": 3550
+    },
+    {
+      "epoch": 4.817320703653586,
+      "grad_norm": 1.4023104906082153,
+      "learning_rate": 0.0002,
+      "loss": 0.5246,
+      "step": 3560
+    },
+    {
+      "epoch": 4.83085250338295,
+      "grad_norm": 1.4798460006713867,
+      "learning_rate": 0.0002,
+      "loss": 0.5098,
+      "step": 3570
+    },
+    {
+      "epoch": 4.844384303112314,
+      "grad_norm": 1.3377588987350464,
+      "learning_rate": 0.0002,
+      "loss": 0.5079,
+      "step": 3580
+    },
+    {
+      "epoch": 4.857916102841678,
+      "grad_norm": 1.5201970338821411,
+      "learning_rate": 0.0002,
+      "loss": 0.4986,
+      "step": 3590
+    },
+    {
+      "epoch": 4.871447902571042,
+      "grad_norm": 1.4653739929199219,
+      "learning_rate": 0.0002,
+      "loss": 0.5066,
+      "step": 3600
+    },
+    {
+      "epoch": 4.884979702300406,
+      "grad_norm": 1.411512017250061,
+      "learning_rate": 0.0002,
+      "loss": 0.5035,
+      "step": 3610
+    },
+    {
+      "epoch": 4.89851150202977,
+      "grad_norm": 1.6721467971801758,
+      "learning_rate": 0.0002,
+      "loss": 0.5169,
+      "step": 3620
+    },
+    {
+      "epoch": 4.912043301759134,
+      "grad_norm": 1.4436850547790527,
+      "learning_rate": 0.0002,
+      "loss": 0.518,
+      "step": 3630
+    },
+    {
+      "epoch": 4.925575101488498,
+      "grad_norm": 1.5604814291000366,
+      "learning_rate": 0.0002,
+      "loss": 0.4647,
+      "step": 3640
+    },
+    {
+      "epoch": 4.939106901217862,
+      "grad_norm": 2.3246071338653564,
+      "learning_rate": 0.0002,
+      "loss": 0.5032,
+      "step": 3650
+    },
+    {
+      "epoch": 4.952638700947226,
+      "grad_norm": 1.637837290763855,
+      "learning_rate": 0.0002,
+      "loss": 0.4877,
+      "step": 3660
+    },
+    {
+      "epoch": 4.96617050067659,
+      "grad_norm": 1.6687514781951904,
+      "learning_rate": 0.0002,
+      "loss": 0.4932,
+      "step": 3670
+    },
+    {
+      "epoch": 4.979702300405954,
+      "grad_norm": 1.3703943490982056,
+      "learning_rate": 0.0002,
+      "loss": 0.5254,
+      "step": 3680
+    },
+    {
+      "epoch": 4.993234100135318,
+      "grad_norm": 2.1465952396392822,
+      "learning_rate": 0.0002,
+      "loss": 0.5059,
+      "step": 3690
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 1.4866586923599243,
+      "eval_runtime": 46.5706,
+      "eval_samples_per_second": 9.792,
+      "eval_steps_per_second": 1.224,
+      "step": 3695
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5912,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.62117115183104e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}