MilaWang committed on Mar 28, 2025

Commit

e0ed810

verified ·

1 Parent(s): 7b66e2b

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/trainer_state.json +874 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/trainer_state.json +1722 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/trainer_state.json +2563 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-3586/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-4782/trainer_state.json +3411 -0

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6fb23e021c82b32b6c2408988634560d7bd603a296f019014af1d02dc1b61a7
+size 109069176

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e8c3cdda853e4a91a58112eaa00a74bd2496ae92198e6ace6646e35f6255e2a
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac5e70252675d8711219ad53f8adb101aeb01d9515529143ff26642df7c06080
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eef4305d3bc2e5edc63c72e269bd7b7a3eee5986ab682dbf582aa59aa0034a91
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7dcfc6ff8012f4bd1bc8fc47f524009810398acd2f545d8a5d52748b6e0f936d
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,874 @@

+{
+  "best_metric": 1.1974399089813232,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-1195",
+  "epoch": 0.999581764951903,
+  "eval_steps": 10,
+  "global_step": 1195,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008364700961940611,
+      "grad_norm": 1.2661447525024414,
+      "learning_rate": 0.0002,
+      "loss": 1.9268,
+      "step": 10
+    },
+    {
+      "epoch": 0.016729401923881223,
+      "grad_norm": 1.3240571022033691,
+      "learning_rate": 0.0002,
+      "loss": 1.6326,
+      "step": 20
+    },
+    {
+      "epoch": 0.025094102885821833,
+      "grad_norm": 0.7347124218940735,
+      "learning_rate": 0.0002,
+      "loss": 1.507,
+      "step": 30
+    },
+    {
+      "epoch": 0.033458803847762446,
+      "grad_norm": 0.9849194288253784,
+      "learning_rate": 0.0002,
+      "loss": 1.5363,
+      "step": 40
+    },
+    {
+      "epoch": 0.04182350480970305,
+      "grad_norm": 0.9093025326728821,
+      "learning_rate": 0.0002,
+      "loss": 1.3674,
+      "step": 50
+    },
+    {
+      "epoch": 0.050188205771643665,
+      "grad_norm": 0.737514078617096,
+      "learning_rate": 0.0002,
+      "loss": 1.2542,
+      "step": 60
+    },
+    {
+      "epoch": 0.05855290673358427,
+      "grad_norm": 1.3245333433151245,
+      "learning_rate": 0.0002,
+      "loss": 1.2172,
+      "step": 70
+    },
+    {
+      "epoch": 0.06691760769552489,
+      "grad_norm": 0.7806007862091064,
+      "learning_rate": 0.0002,
+      "loss": 1.2478,
+      "step": 80
+    },
+    {
+      "epoch": 0.07528230865746549,
+      "grad_norm": 0.6627500057220459,
+      "learning_rate": 0.0002,
+      "loss": 1.1398,
+      "step": 90
+    },
+    {
+      "epoch": 0.0836470096194061,
+      "grad_norm": 1.0278682708740234,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 100
+    },
+    {
+      "epoch": 0.09201171058134672,
+      "grad_norm": 0.7746474146842957,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 110
+    },
+    {
+      "epoch": 0.10037641154328733,
+      "grad_norm": 0.5935637950897217,
+      "learning_rate": 0.0002,
+      "loss": 1.1171,
+      "step": 120
+    },
+    {
+      "epoch": 0.10874111250522794,
+      "grad_norm": 0.6738003492355347,
+      "learning_rate": 0.0002,
+      "loss": 1.1625,
+      "step": 130
+    },
+    {
+      "epoch": 0.11710581346716854,
+      "grad_norm": 0.6447349190711975,
+      "learning_rate": 0.0002,
+      "loss": 1.3002,
+      "step": 140
+    },
+    {
+      "epoch": 0.12547051442910917,
+      "grad_norm": 0.5628684759140015,
+      "learning_rate": 0.0002,
+      "loss": 1.1294,
+      "step": 150
+    },
+    {
+      "epoch": 0.13383521539104978,
+      "grad_norm": 0.7446871399879456,
+      "learning_rate": 0.0002,
+      "loss": 1.1374,
+      "step": 160
+    },
+    {
+      "epoch": 0.14219991635299037,
+      "grad_norm": 0.5214279294013977,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 170
+    },
+    {
+      "epoch": 0.15056461731493098,
+      "grad_norm": 0.5324464440345764,
+      "learning_rate": 0.0002,
+      "loss": 1.2612,
+      "step": 180
+    },
+    {
+      "epoch": 0.1589293182768716,
+      "grad_norm": 0.5539828538894653,
+      "learning_rate": 0.0002,
+      "loss": 1.3168,
+      "step": 190
+    },
+    {
+      "epoch": 0.1672940192388122,
+      "grad_norm": 0.5192331671714783,
+      "learning_rate": 0.0002,
+      "loss": 1.0835,
+      "step": 200
+    },
+    {
+      "epoch": 0.17565872020075282,
+      "grad_norm": 0.7160783410072327,
+      "learning_rate": 0.0002,
+      "loss": 1.1799,
+      "step": 210
+    },
+    {
+      "epoch": 0.18402342116269343,
+      "grad_norm": 0.8428353071212769,
+      "learning_rate": 0.0002,
+      "loss": 1.1527,
+      "step": 220
+    },
+    {
+      "epoch": 0.19238812212463405,
+      "grad_norm": 0.493561714887619,
+      "learning_rate": 0.0002,
+      "loss": 1.1284,
+      "step": 230
+    },
+    {
+      "epoch": 0.20075282308657466,
+      "grad_norm": 2.522308111190796,
+      "learning_rate": 0.0002,
+      "loss": 1.1975,
+      "step": 240
+    },
+    {
+      "epoch": 0.20911752404851527,
+      "grad_norm": 0.7338423728942871,
+      "learning_rate": 0.0002,
+      "loss": 1.1459,
+      "step": 250
+    },
+    {
+      "epoch": 0.2174822250104559,
+      "grad_norm": 0.6501832604408264,
+      "learning_rate": 0.0002,
+      "loss": 1.1311,
+      "step": 260
+    },
+    {
+      "epoch": 0.2258469259723965,
+      "grad_norm": 0.6331472992897034,
+      "learning_rate": 0.0002,
+      "loss": 1.2241,
+      "step": 270
+    },
+    {
+      "epoch": 0.23421162693433709,
+      "grad_norm": 0.5653548836708069,
+      "learning_rate": 0.0002,
+      "loss": 1.2329,
+      "step": 280
+    },
+    {
+      "epoch": 0.2425763278962777,
+      "grad_norm": 0.5833444595336914,
+      "learning_rate": 0.0002,
+      "loss": 1.119,
+      "step": 290
+    },
+    {
+      "epoch": 0.25094102885821834,
+      "grad_norm": 0.6707335114479065,
+      "learning_rate": 0.0002,
+      "loss": 1.2157,
+      "step": 300
+    },
+    {
+      "epoch": 0.2593057298201589,
+      "grad_norm": 0.5435659885406494,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 310
+    },
+    {
+      "epoch": 0.26767043078209957,
+      "grad_norm": 0.5752334594726562,
+      "learning_rate": 0.0002,
+      "loss": 1.0781,
+      "step": 320
+    },
+    {
+      "epoch": 0.27603513174404015,
+      "grad_norm": 0.5790163278579712,
+      "learning_rate": 0.0002,
+      "loss": 1.0493,
+      "step": 330
+    },
+    {
+      "epoch": 0.28439983270598074,
+      "grad_norm": 0.46593040227890015,
+      "learning_rate": 0.0002,
+      "loss": 1.2281,
+      "step": 340
+    },
+    {
+      "epoch": 0.2927645336679214,
+      "grad_norm": 0.7713788151741028,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 350
+    },
+    {
+      "epoch": 0.30112923462986196,
+      "grad_norm": 0.7719253301620483,
+      "learning_rate": 0.0002,
+      "loss": 1.1672,
+      "step": 360
+    },
+    {
+      "epoch": 0.3094939355918026,
+      "grad_norm": 0.7065562605857849,
+      "learning_rate": 0.0002,
+      "loss": 1.0884,
+      "step": 370
+    },
+    {
+      "epoch": 0.3178586365537432,
+      "grad_norm": 0.7082679271697998,
+      "learning_rate": 0.0002,
+      "loss": 1.0902,
+      "step": 380
+    },
+    {
+      "epoch": 0.32622333751568383,
+      "grad_norm": 0.5779536366462708,
+      "learning_rate": 0.0002,
+      "loss": 1.1696,
+      "step": 390
+    },
+    {
+      "epoch": 0.3345880384776244,
+      "grad_norm": 0.6321173310279846,
+      "learning_rate": 0.0002,
+      "loss": 1.1916,
+      "step": 400
+    },
+    {
+      "epoch": 0.34295273943956506,
+      "grad_norm": 0.7237968444824219,
+      "learning_rate": 0.0002,
+      "loss": 1.1419,
+      "step": 410
+    },
+    {
+      "epoch": 0.35131744040150564,
+      "grad_norm": 0.6730817556381226,
+      "learning_rate": 0.0002,
+      "loss": 0.9877,
+      "step": 420
+    },
+    {
+      "epoch": 0.3596821413634463,
+      "grad_norm": 0.6245285868644714,
+      "learning_rate": 0.0002,
+      "loss": 1.108,
+      "step": 430
+    },
+    {
+      "epoch": 0.36804684232538687,
+      "grad_norm": 0.9926134347915649,
+      "learning_rate": 0.0002,
+      "loss": 1.209,
+      "step": 440
+    },
+    {
+      "epoch": 0.37641154328732745,
+      "grad_norm": 0.5567468404769897,
+      "learning_rate": 0.0002,
+      "loss": 1.0664,
+      "step": 450
+    },
+    {
+      "epoch": 0.3847762442492681,
+      "grad_norm": 0.5764540433883667,
+      "learning_rate": 0.0002,
+      "loss": 1.1838,
+      "step": 460
+    },
+    {
+      "epoch": 0.3931409452112087,
+      "grad_norm": 1.1908321380615234,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 470
+    },
+    {
+      "epoch": 0.4015056461731493,
+      "grad_norm": 0.6756157875061035,
+      "learning_rate": 0.0002,
+      "loss": 1.1601,
+      "step": 480
+    },
+    {
+      "epoch": 0.4098703471350899,
+      "grad_norm": 0.5793355107307434,
+      "learning_rate": 0.0002,
+      "loss": 1.1703,
+      "step": 490
+    },
+    {
+      "epoch": 0.41823504809703055,
+      "grad_norm": 0.6145297288894653,
+      "learning_rate": 0.0002,
+      "loss": 1.1289,
+      "step": 500
+    },
+    {
+      "epoch": 0.42659974905897113,
+      "grad_norm": 0.48073795437812805,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 510
+    },
+    {
+      "epoch": 0.4349644500209118,
+      "grad_norm": 0.802431046962738,
+      "learning_rate": 0.0002,
+      "loss": 1.1335,
+      "step": 520
+    },
+    {
+      "epoch": 0.44332915098285236,
+      "grad_norm": 0.5906000137329102,
+      "learning_rate": 0.0002,
+      "loss": 1.0574,
+      "step": 530
+    },
+    {
+      "epoch": 0.451693851944793,
+      "grad_norm": 0.5615521669387817,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 540
+    },
+    {
+      "epoch": 0.4600585529067336,
+      "grad_norm": 0.5688650012016296,
+      "learning_rate": 0.0002,
+      "loss": 1.2228,
+      "step": 550
+    },
+    {
+      "epoch": 0.46842325386867417,
+      "grad_norm": 0.7505079507827759,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 560
+    },
+    {
+      "epoch": 0.4767879548306148,
+      "grad_norm": 0.6905680298805237,
+      "learning_rate": 0.0002,
+      "loss": 1.1566,
+      "step": 570
+    },
+    {
+      "epoch": 0.4851526557925554,
+      "grad_norm": 0.5885183811187744,
+      "learning_rate": 0.0002,
+      "loss": 1.1256,
+      "step": 580
+    },
+    {
+      "epoch": 0.49351735675449604,
+      "grad_norm": 0.7367458343505859,
+      "learning_rate": 0.0002,
+      "loss": 1.211,
+      "step": 590
+    },
+    {
+      "epoch": 0.5018820577164367,
+      "grad_norm": 0.9157859086990356,
+      "learning_rate": 0.0002,
+      "loss": 1.1215,
+      "step": 600
+    },
+    {
+      "epoch": 0.5102467586783772,
+      "grad_norm": 0.49971529841423035,
+      "learning_rate": 0.0002,
+      "loss": 1.3101,
+      "step": 610
+    },
+    {
+      "epoch": 0.5186114596403179,
+      "grad_norm": 0.5031328797340393,
+      "learning_rate": 0.0002,
+      "loss": 1.1223,
+      "step": 620
+    },
+    {
+      "epoch": 0.5269761606022585,
+      "grad_norm": 0.6945798397064209,
+      "learning_rate": 0.0002,
+      "loss": 1.154,
+      "step": 630
+    },
+    {
+      "epoch": 0.5353408615641991,
+      "grad_norm": 0.7563218474388123,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 640
+    },
+    {
+      "epoch": 0.5437055625261397,
+      "grad_norm": 0.9215132594108582,
+      "learning_rate": 0.0002,
+      "loss": 1.2364,
+      "step": 650
+    },
+    {
+      "epoch": 0.5520702634880803,
+      "grad_norm": 1.0132478475570679,
+      "learning_rate": 0.0002,
+      "loss": 1.2179,
+      "step": 660
+    },
+    {
+      "epoch": 0.560434964450021,
+      "grad_norm": 1.448024868965149,
+      "learning_rate": 0.0002,
+      "loss": 1.1016,
+      "step": 670
+    },
+    {
+      "epoch": 0.5687996654119615,
+      "grad_norm": 0.7022866010665894,
+      "learning_rate": 0.0002,
+      "loss": 1.1918,
+      "step": 680
+    },
+    {
+      "epoch": 0.5771643663739021,
+      "grad_norm": 0.7366224527359009,
+      "learning_rate": 0.0002,
+      "loss": 1.1108,
+      "step": 690
+    },
+    {
+      "epoch": 0.5855290673358428,
+      "grad_norm": 0.722874641418457,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 700
+    },
+    {
+      "epoch": 0.5938937682977834,
+      "grad_norm": 1.0756473541259766,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 710
+    },
+    {
+      "epoch": 0.6022584692597239,
+      "grad_norm": 0.607101559638977,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 720
+    },
+    {
+      "epoch": 0.6106231702216646,
+      "grad_norm": 0.7424359917640686,
+      "learning_rate": 0.0002,
+      "loss": 1.1561,
+      "step": 730
+    },
+    {
+      "epoch": 0.6189878711836052,
+      "grad_norm": 0.7123169898986816,
+      "learning_rate": 0.0002,
+      "loss": 1.1124,
+      "step": 740
+    },
+    {
+      "epoch": 0.6273525721455459,
+      "grad_norm": 0.672195315361023,
+      "learning_rate": 0.0002,
+      "loss": 1.1209,
+      "step": 750
+    },
+    {
+      "epoch": 0.6357172731074864,
+      "grad_norm": 0.8329780697822571,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 760
+    },
+    {
+      "epoch": 0.644081974069427,
+      "grad_norm": 0.7011522650718689,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 770
+    },
+    {
+      "epoch": 0.6524466750313677,
+      "grad_norm": 0.6425889730453491,
+      "learning_rate": 0.0002,
+      "loss": 1.2505,
+      "step": 780
+    },
+    {
+      "epoch": 0.6608113759933082,
+      "grad_norm": 0.8729137182235718,
+      "learning_rate": 0.0002,
+      "loss": 1.2005,
+      "step": 790
+    },
+    {
+      "epoch": 0.6691760769552488,
+      "grad_norm": 0.5885024070739746,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 800
+    },
+    {
+      "epoch": 0.6775407779171895,
+      "grad_norm": 0.526979386806488,
+      "learning_rate": 0.0002,
+      "loss": 1.1901,
+      "step": 810
+    },
+    {
+      "epoch": 0.6859054788791301,
+      "grad_norm": 0.998365044593811,
+      "learning_rate": 0.0002,
+      "loss": 1.1757,
+      "step": 820
+    },
+    {
+      "epoch": 0.6942701798410706,
+      "grad_norm": 0.6049501299858093,
+      "learning_rate": 0.0002,
+      "loss": 1.0278,
+      "step": 830
+    },
+    {
+      "epoch": 0.7026348808030113,
+      "grad_norm": 0.7015583515167236,
+      "learning_rate": 0.0002,
+      "loss": 1.1102,
+      "step": 840
+    },
+    {
+      "epoch": 0.7109995817649519,
+      "grad_norm": 0.5852547883987427,
+      "learning_rate": 0.0002,
+      "loss": 1.1041,
+      "step": 850
+    },
+    {
+      "epoch": 0.7193642827268926,
+      "grad_norm": 0.6017204523086548,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 860
+    },
+    {
+      "epoch": 0.7277289836888331,
+      "grad_norm": 0.7195692658424377,
+      "learning_rate": 0.0002,
+      "loss": 1.0611,
+      "step": 870
+    },
+    {
+      "epoch": 0.7360936846507737,
+      "grad_norm": 0.8087519407272339,
+      "learning_rate": 0.0002,
+      "loss": 1.1497,
+      "step": 880
+    },
+    {
+      "epoch": 0.7444583856127144,
+      "grad_norm": 0.988362193107605,
+      "learning_rate": 0.0002,
+      "loss": 1.1087,
+      "step": 890
+    },
+    {
+      "epoch": 0.7528230865746549,
+      "grad_norm": 0.6142330765724182,
+      "learning_rate": 0.0002,
+      "loss": 1.049,
+      "step": 900
+    },
+    {
+      "epoch": 0.7611877875365956,
+      "grad_norm": 0.6751818656921387,
+      "learning_rate": 0.0002,
+      "loss": 1.0388,
+      "step": 910
+    },
+    {
+      "epoch": 0.7695524884985362,
+      "grad_norm": 0.7528653740882874,
+      "learning_rate": 0.0002,
+      "loss": 1.2125,
+      "step": 920
+    },
+    {
+      "epoch": 0.7779171894604768,
+      "grad_norm": 0.613039493560791,
+      "learning_rate": 0.0002,
+      "loss": 0.9926,
+      "step": 930
+    },
+    {
+      "epoch": 0.7862818904224174,
+      "grad_norm": 0.8040242791175842,
+      "learning_rate": 0.0002,
+      "loss": 1.2582,
+      "step": 940
+    },
+    {
+      "epoch": 0.794646591384358,
+      "grad_norm": 0.5306838154792786,
+      "learning_rate": 0.0002,
+      "loss": 1.1397,
+      "step": 950
+    },
+    {
+      "epoch": 0.8030112923462986,
+      "grad_norm": 0.7037438750267029,
+      "learning_rate": 0.0002,
+      "loss": 1.0303,
+      "step": 960
+    },
+    {
+      "epoch": 0.8113759933082393,
+      "grad_norm": 0.6726985573768616,
+      "learning_rate": 0.0002,
+      "loss": 1.1531,
+      "step": 970
+    },
+    {
+      "epoch": 0.8197406942701798,
+      "grad_norm": 0.9324426651000977,
+      "learning_rate": 0.0002,
+      "loss": 1.125,
+      "step": 980
+    },
+    {
+      "epoch": 0.8281053952321205,
+      "grad_norm": 0.5811492204666138,
+      "learning_rate": 0.0002,
+      "loss": 1.0744,
+      "step": 990
+    },
+    {
+      "epoch": 0.8364700961940611,
+      "grad_norm": 0.6894899606704712,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8448347971560016,
+      "grad_norm": 0.5663559436798096,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8531994981179423,
+      "grad_norm": 0.5555400252342224,
+      "learning_rate": 0.0002,
+      "loss": 1.0337,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8615641990798829,
+      "grad_norm": 0.4418621063232422,
+      "learning_rate": 0.0002,
+      "loss": 1.1086,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8699289000418235,
+      "grad_norm": 0.7832980751991272,
+      "learning_rate": 0.0002,
+      "loss": 1.1291,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8782936010037641,
+      "grad_norm": 0.6883782744407654,
+      "learning_rate": 0.0002,
+      "loss": 1.1538,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8866583019657047,
+      "grad_norm": 0.5617508888244629,
+      "learning_rate": 0.0002,
+      "loss": 1.0311,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8950230029276454,
+      "grad_norm": 0.723233699798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1869,
+      "step": 1070
+    },
+    {
+      "epoch": 0.903387703889586,
+      "grad_norm": 2.8922297954559326,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9117524048515265,
+      "grad_norm": 1.5861668586730957,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9201171058134672,
+      "grad_norm": 0.6625565886497498,
+      "learning_rate": 0.0002,
+      "loss": 1.0758,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9284818067754078,
+      "grad_norm": 0.6424002647399902,
+      "learning_rate": 0.0002,
+      "loss": 1.2524,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9368465077373483,
+      "grad_norm": 0.7253570556640625,
+      "learning_rate": 0.0002,
+      "loss": 1.0261,
+      "step": 1120
+    },
+    {
+      "epoch": 0.945211208699289,
+      "grad_norm": 0.6529237627983093,
+      "learning_rate": 0.0002,
+      "loss": 1.2131,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9535759096612296,
+      "grad_norm": 0.7082931399345398,
+      "learning_rate": 0.0002,
+      "loss": 1.0705,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9619406106231703,
+      "grad_norm": 1.10663902759552,
+      "learning_rate": 0.0002,
+      "loss": 1.2197,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9703053115851108,
+      "grad_norm": 0.6979895830154419,
+      "learning_rate": 0.0002,
+      "loss": 1.1051,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9786700125470514,
+      "grad_norm": 0.896873950958252,
+      "learning_rate": 0.0002,
+      "loss": 1.1516,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9870347135089921,
+      "grad_norm": 0.5664224624633789,
+      "learning_rate": 0.0002,
+      "loss": 1.0224,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9953994144709327,
+      "grad_norm": 0.6827336549758911,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 1190
+    },
+    {
+      "epoch": 0.999581764951903,
+      "eval_loss": 1.1974399089813232,
+      "eval_runtime": 83.0008,
+      "eval_samples_per_second": 5.494,
+      "eval_steps_per_second": 0.687,
+      "step": 1195
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9560,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.24522357784576e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82cc1a869a13836981e3f5a21d92f839005da543aa938bca6e96fe51edb97f77
+size 5624

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6fb23e021c82b32b6c2408988634560d7bd603a296f019014af1d02dc1b61a7
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:183ecd15c3bceef2b91da9fbe7e2142b8be320970c114d258575807b7071e67a
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a99f1f99fa86b1f7a1ff9d40c9d7edf03237855c42276f991d620cb3ce038a37
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:defea0a4ffa14d7f115a494411fb4ba866efbebcbc1cdf06c54ec34a1ba15933
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1722 @@

+{
+  "best_metric": 1.1764153242111206,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 2391,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008364700961940611,
+      "grad_norm": 1.2661447525024414,
+      "learning_rate": 0.0002,
+      "loss": 1.9268,
+      "step": 10
+    },
+    {
+      "epoch": 0.016729401923881223,
+      "grad_norm": 1.3240571022033691,
+      "learning_rate": 0.0002,
+      "loss": 1.6326,
+      "step": 20
+    },
+    {
+      "epoch": 0.025094102885821833,
+      "grad_norm": 0.7347124218940735,
+      "learning_rate": 0.0002,
+      "loss": 1.507,
+      "step": 30
+    },
+    {
+      "epoch": 0.033458803847762446,
+      "grad_norm": 0.9849194288253784,
+      "learning_rate": 0.0002,
+      "loss": 1.5363,
+      "step": 40
+    },
+    {
+      "epoch": 0.04182350480970305,
+      "grad_norm": 0.9093025326728821,
+      "learning_rate": 0.0002,
+      "loss": 1.3674,
+      "step": 50
+    },
+    {
+      "epoch": 0.050188205771643665,
+      "grad_norm": 0.737514078617096,
+      "learning_rate": 0.0002,
+      "loss": 1.2542,
+      "step": 60
+    },
+    {
+      "epoch": 0.05855290673358427,
+      "grad_norm": 1.3245333433151245,
+      "learning_rate": 0.0002,
+      "loss": 1.2172,
+      "step": 70
+    },
+    {
+      "epoch": 0.06691760769552489,
+      "grad_norm": 0.7806007862091064,
+      "learning_rate": 0.0002,
+      "loss": 1.2478,
+      "step": 80
+    },
+    {
+      "epoch": 0.07528230865746549,
+      "grad_norm": 0.6627500057220459,
+      "learning_rate": 0.0002,
+      "loss": 1.1398,
+      "step": 90
+    },
+    {
+      "epoch": 0.0836470096194061,
+      "grad_norm": 1.0278682708740234,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 100
+    },
+    {
+      "epoch": 0.09201171058134672,
+      "grad_norm": 0.7746474146842957,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 110
+    },
+    {
+      "epoch": 0.10037641154328733,
+      "grad_norm": 0.5935637950897217,
+      "learning_rate": 0.0002,
+      "loss": 1.1171,
+      "step": 120
+    },
+    {
+      "epoch": 0.10874111250522794,
+      "grad_norm": 0.6738003492355347,
+      "learning_rate": 0.0002,
+      "loss": 1.1625,
+      "step": 130
+    },
+    {
+      "epoch": 0.11710581346716854,
+      "grad_norm": 0.6447349190711975,
+      "learning_rate": 0.0002,
+      "loss": 1.3002,
+      "step": 140
+    },
+    {
+      "epoch": 0.12547051442910917,
+      "grad_norm": 0.5628684759140015,
+      "learning_rate": 0.0002,
+      "loss": 1.1294,
+      "step": 150
+    },
+    {
+      "epoch": 0.13383521539104978,
+      "grad_norm": 0.7446871399879456,
+      "learning_rate": 0.0002,
+      "loss": 1.1374,
+      "step": 160
+    },
+    {
+      "epoch": 0.14219991635299037,
+      "grad_norm": 0.5214279294013977,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 170
+    },
+    {
+      "epoch": 0.15056461731493098,
+      "grad_norm": 0.5324464440345764,
+      "learning_rate": 0.0002,
+      "loss": 1.2612,
+      "step": 180
+    },
+    {
+      "epoch": 0.1589293182768716,
+      "grad_norm": 0.5539828538894653,
+      "learning_rate": 0.0002,
+      "loss": 1.3168,
+      "step": 190
+    },
+    {
+      "epoch": 0.1672940192388122,
+      "grad_norm": 0.5192331671714783,
+      "learning_rate": 0.0002,
+      "loss": 1.0835,
+      "step": 200
+    },
+    {
+      "epoch": 0.17565872020075282,
+      "grad_norm": 0.7160783410072327,
+      "learning_rate": 0.0002,
+      "loss": 1.1799,
+      "step": 210
+    },
+    {
+      "epoch": 0.18402342116269343,
+      "grad_norm": 0.8428353071212769,
+      "learning_rate": 0.0002,
+      "loss": 1.1527,
+      "step": 220
+    },
+    {
+      "epoch": 0.19238812212463405,
+      "grad_norm": 0.493561714887619,
+      "learning_rate": 0.0002,
+      "loss": 1.1284,
+      "step": 230
+    },
+    {
+      "epoch": 0.20075282308657466,
+      "grad_norm": 2.522308111190796,
+      "learning_rate": 0.0002,
+      "loss": 1.1975,
+      "step": 240
+    },
+    {
+      "epoch": 0.20911752404851527,
+      "grad_norm": 0.7338423728942871,
+      "learning_rate": 0.0002,
+      "loss": 1.1459,
+      "step": 250
+    },
+    {
+      "epoch": 0.2174822250104559,
+      "grad_norm": 0.6501832604408264,
+      "learning_rate": 0.0002,
+      "loss": 1.1311,
+      "step": 260
+    },
+    {
+      "epoch": 0.2258469259723965,
+      "grad_norm": 0.6331472992897034,
+      "learning_rate": 0.0002,
+      "loss": 1.2241,
+      "step": 270
+    },
+    {
+      "epoch": 0.23421162693433709,
+      "grad_norm": 0.5653548836708069,
+      "learning_rate": 0.0002,
+      "loss": 1.2329,
+      "step": 280
+    },
+    {
+      "epoch": 0.2425763278962777,
+      "grad_norm": 0.5833444595336914,
+      "learning_rate": 0.0002,
+      "loss": 1.119,
+      "step": 290
+    },
+    {
+      "epoch": 0.25094102885821834,
+      "grad_norm": 0.6707335114479065,
+      "learning_rate": 0.0002,
+      "loss": 1.2157,
+      "step": 300
+    },
+    {
+      "epoch": 0.2593057298201589,
+      "grad_norm": 0.5435659885406494,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 310
+    },
+    {
+      "epoch": 0.26767043078209957,
+      "grad_norm": 0.5752334594726562,
+      "learning_rate": 0.0002,
+      "loss": 1.0781,
+      "step": 320
+    },
+    {
+      "epoch": 0.27603513174404015,
+      "grad_norm": 0.5790163278579712,
+      "learning_rate": 0.0002,
+      "loss": 1.0493,
+      "step": 330
+    },
+    {
+      "epoch": 0.28439983270598074,
+      "grad_norm": 0.46593040227890015,
+      "learning_rate": 0.0002,
+      "loss": 1.2281,
+      "step": 340
+    },
+    {
+      "epoch": 0.2927645336679214,
+      "grad_norm": 0.7713788151741028,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 350
+    },
+    {
+      "epoch": 0.30112923462986196,
+      "grad_norm": 0.7719253301620483,
+      "learning_rate": 0.0002,
+      "loss": 1.1672,
+      "step": 360
+    },
+    {
+      "epoch": 0.3094939355918026,
+      "grad_norm": 0.7065562605857849,
+      "learning_rate": 0.0002,
+      "loss": 1.0884,
+      "step": 370
+    },
+    {
+      "epoch": 0.3178586365537432,
+      "grad_norm": 0.7082679271697998,
+      "learning_rate": 0.0002,
+      "loss": 1.0902,
+      "step": 380
+    },
+    {
+      "epoch": 0.32622333751568383,
+      "grad_norm": 0.5779536366462708,
+      "learning_rate": 0.0002,
+      "loss": 1.1696,
+      "step": 390
+    },
+    {
+      "epoch": 0.3345880384776244,
+      "grad_norm": 0.6321173310279846,
+      "learning_rate": 0.0002,
+      "loss": 1.1916,
+      "step": 400
+    },
+    {
+      "epoch": 0.34295273943956506,
+      "grad_norm": 0.7237968444824219,
+      "learning_rate": 0.0002,
+      "loss": 1.1419,
+      "step": 410
+    },
+    {
+      "epoch": 0.35131744040150564,
+      "grad_norm": 0.6730817556381226,
+      "learning_rate": 0.0002,
+      "loss": 0.9877,
+      "step": 420
+    },
+    {
+      "epoch": 0.3596821413634463,
+      "grad_norm": 0.6245285868644714,
+      "learning_rate": 0.0002,
+      "loss": 1.108,
+      "step": 430
+    },
+    {
+      "epoch": 0.36804684232538687,
+      "grad_norm": 0.9926134347915649,
+      "learning_rate": 0.0002,
+      "loss": 1.209,
+      "step": 440
+    },
+    {
+      "epoch": 0.37641154328732745,
+      "grad_norm": 0.5567468404769897,
+      "learning_rate": 0.0002,
+      "loss": 1.0664,
+      "step": 450
+    },
+    {
+      "epoch": 0.3847762442492681,
+      "grad_norm": 0.5764540433883667,
+      "learning_rate": 0.0002,
+      "loss": 1.1838,
+      "step": 460
+    },
+    {
+      "epoch": 0.3931409452112087,
+      "grad_norm": 1.1908321380615234,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 470
+    },
+    {
+      "epoch": 0.4015056461731493,
+      "grad_norm": 0.6756157875061035,
+      "learning_rate": 0.0002,
+      "loss": 1.1601,
+      "step": 480
+    },
+    {
+      "epoch": 0.4098703471350899,
+      "grad_norm": 0.5793355107307434,
+      "learning_rate": 0.0002,
+      "loss": 1.1703,
+      "step": 490
+    },
+    {
+      "epoch": 0.41823504809703055,
+      "grad_norm": 0.6145297288894653,
+      "learning_rate": 0.0002,
+      "loss": 1.1289,
+      "step": 500
+    },
+    {
+      "epoch": 0.42659974905897113,
+      "grad_norm": 0.48073795437812805,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 510
+    },
+    {
+      "epoch": 0.4349644500209118,
+      "grad_norm": 0.802431046962738,
+      "learning_rate": 0.0002,
+      "loss": 1.1335,
+      "step": 520
+    },
+    {
+      "epoch": 0.44332915098285236,
+      "grad_norm": 0.5906000137329102,
+      "learning_rate": 0.0002,
+      "loss": 1.0574,
+      "step": 530
+    },
+    {
+      "epoch": 0.451693851944793,
+      "grad_norm": 0.5615521669387817,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 540
+    },
+    {
+      "epoch": 0.4600585529067336,
+      "grad_norm": 0.5688650012016296,
+      "learning_rate": 0.0002,
+      "loss": 1.2228,
+      "step": 550
+    },
+    {
+      "epoch": 0.46842325386867417,
+      "grad_norm": 0.7505079507827759,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 560
+    },
+    {
+      "epoch": 0.4767879548306148,
+      "grad_norm": 0.6905680298805237,
+      "learning_rate": 0.0002,
+      "loss": 1.1566,
+      "step": 570
+    },
+    {
+      "epoch": 0.4851526557925554,
+      "grad_norm": 0.5885183811187744,
+      "learning_rate": 0.0002,
+      "loss": 1.1256,
+      "step": 580
+    },
+    {
+      "epoch": 0.49351735675449604,
+      "grad_norm": 0.7367458343505859,
+      "learning_rate": 0.0002,
+      "loss": 1.211,
+      "step": 590
+    },
+    {
+      "epoch": 0.5018820577164367,
+      "grad_norm": 0.9157859086990356,
+      "learning_rate": 0.0002,
+      "loss": 1.1215,
+      "step": 600
+    },
+    {
+      "epoch": 0.5102467586783772,
+      "grad_norm": 0.49971529841423035,
+      "learning_rate": 0.0002,
+      "loss": 1.3101,
+      "step": 610
+    },
+    {
+      "epoch": 0.5186114596403179,
+      "grad_norm": 0.5031328797340393,
+      "learning_rate": 0.0002,
+      "loss": 1.1223,
+      "step": 620
+    },
+    {
+      "epoch": 0.5269761606022585,
+      "grad_norm": 0.6945798397064209,
+      "learning_rate": 0.0002,
+      "loss": 1.154,
+      "step": 630
+    },
+    {
+      "epoch": 0.5353408615641991,
+      "grad_norm": 0.7563218474388123,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 640
+    },
+    {
+      "epoch": 0.5437055625261397,
+      "grad_norm": 0.9215132594108582,
+      "learning_rate": 0.0002,
+      "loss": 1.2364,
+      "step": 650
+    },
+    {
+      "epoch": 0.5520702634880803,
+      "grad_norm": 1.0132478475570679,
+      "learning_rate": 0.0002,
+      "loss": 1.2179,
+      "step": 660
+    },
+    {
+      "epoch": 0.560434964450021,
+      "grad_norm": 1.448024868965149,
+      "learning_rate": 0.0002,
+      "loss": 1.1016,
+      "step": 670
+    },
+    {
+      "epoch": 0.5687996654119615,
+      "grad_norm": 0.7022866010665894,
+      "learning_rate": 0.0002,
+      "loss": 1.1918,
+      "step": 680
+    },
+    {
+      "epoch": 0.5771643663739021,
+      "grad_norm": 0.7366224527359009,
+      "learning_rate": 0.0002,
+      "loss": 1.1108,
+      "step": 690
+    },
+    {
+      "epoch": 0.5855290673358428,
+      "grad_norm": 0.722874641418457,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 700
+    },
+    {
+      "epoch": 0.5938937682977834,
+      "grad_norm": 1.0756473541259766,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 710
+    },
+    {
+      "epoch": 0.6022584692597239,
+      "grad_norm": 0.607101559638977,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 720
+    },
+    {
+      "epoch": 0.6106231702216646,
+      "grad_norm": 0.7424359917640686,
+      "learning_rate": 0.0002,
+      "loss": 1.1561,
+      "step": 730
+    },
+    {
+      "epoch": 0.6189878711836052,
+      "grad_norm": 0.7123169898986816,
+      "learning_rate": 0.0002,
+      "loss": 1.1124,
+      "step": 740
+    },
+    {
+      "epoch": 0.6273525721455459,
+      "grad_norm": 0.672195315361023,
+      "learning_rate": 0.0002,
+      "loss": 1.1209,
+      "step": 750
+    },
+    {
+      "epoch": 0.6357172731074864,
+      "grad_norm": 0.8329780697822571,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 760
+    },
+    {
+      "epoch": 0.644081974069427,
+      "grad_norm": 0.7011522650718689,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 770
+    },
+    {
+      "epoch": 0.6524466750313677,
+      "grad_norm": 0.6425889730453491,
+      "learning_rate": 0.0002,
+      "loss": 1.2505,
+      "step": 780
+    },
+    {
+      "epoch": 0.6608113759933082,
+      "grad_norm": 0.8729137182235718,
+      "learning_rate": 0.0002,
+      "loss": 1.2005,
+      "step": 790
+    },
+    {
+      "epoch": 0.6691760769552488,
+      "grad_norm": 0.5885024070739746,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 800
+    },
+    {
+      "epoch": 0.6775407779171895,
+      "grad_norm": 0.526979386806488,
+      "learning_rate": 0.0002,
+      "loss": 1.1901,
+      "step": 810
+    },
+    {
+      "epoch": 0.6859054788791301,
+      "grad_norm": 0.998365044593811,
+      "learning_rate": 0.0002,
+      "loss": 1.1757,
+      "step": 820
+    },
+    {
+      "epoch": 0.6942701798410706,
+      "grad_norm": 0.6049501299858093,
+      "learning_rate": 0.0002,
+      "loss": 1.0278,
+      "step": 830
+    },
+    {
+      "epoch": 0.7026348808030113,
+      "grad_norm": 0.7015583515167236,
+      "learning_rate": 0.0002,
+      "loss": 1.1102,
+      "step": 840
+    },
+    {
+      "epoch": 0.7109995817649519,
+      "grad_norm": 0.5852547883987427,
+      "learning_rate": 0.0002,
+      "loss": 1.1041,
+      "step": 850
+    },
+    {
+      "epoch": 0.7193642827268926,
+      "grad_norm": 0.6017204523086548,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 860
+    },
+    {
+      "epoch": 0.7277289836888331,
+      "grad_norm": 0.7195692658424377,
+      "learning_rate": 0.0002,
+      "loss": 1.0611,
+      "step": 870
+    },
+    {
+      "epoch": 0.7360936846507737,
+      "grad_norm": 0.8087519407272339,
+      "learning_rate": 0.0002,
+      "loss": 1.1497,
+      "step": 880
+    },
+    {
+      "epoch": 0.7444583856127144,
+      "grad_norm": 0.988362193107605,
+      "learning_rate": 0.0002,
+      "loss": 1.1087,
+      "step": 890
+    },
+    {
+      "epoch": 0.7528230865746549,
+      "grad_norm": 0.6142330765724182,
+      "learning_rate": 0.0002,
+      "loss": 1.049,
+      "step": 900
+    },
+    {
+      "epoch": 0.7611877875365956,
+      "grad_norm": 0.6751818656921387,
+      "learning_rate": 0.0002,
+      "loss": 1.0388,
+      "step": 910
+    },
+    {
+      "epoch": 0.7695524884985362,
+      "grad_norm": 0.7528653740882874,
+      "learning_rate": 0.0002,
+      "loss": 1.2125,
+      "step": 920
+    },
+    {
+      "epoch": 0.7779171894604768,
+      "grad_norm": 0.613039493560791,
+      "learning_rate": 0.0002,
+      "loss": 0.9926,
+      "step": 930
+    },
+    {
+      "epoch": 0.7862818904224174,
+      "grad_norm": 0.8040242791175842,
+      "learning_rate": 0.0002,
+      "loss": 1.2582,
+      "step": 940
+    },
+    {
+      "epoch": 0.794646591384358,
+      "grad_norm": 0.5306838154792786,
+      "learning_rate": 0.0002,
+      "loss": 1.1397,
+      "step": 950
+    },
+    {
+      "epoch": 0.8030112923462986,
+      "grad_norm": 0.7037438750267029,
+      "learning_rate": 0.0002,
+      "loss": 1.0303,
+      "step": 960
+    },
+    {
+      "epoch": 0.8113759933082393,
+      "grad_norm": 0.6726985573768616,
+      "learning_rate": 0.0002,
+      "loss": 1.1531,
+      "step": 970
+    },
+    {
+      "epoch": 0.8197406942701798,
+      "grad_norm": 0.9324426651000977,
+      "learning_rate": 0.0002,
+      "loss": 1.125,
+      "step": 980
+    },
+    {
+      "epoch": 0.8281053952321205,
+      "grad_norm": 0.5811492204666138,
+      "learning_rate": 0.0002,
+      "loss": 1.0744,
+      "step": 990
+    },
+    {
+      "epoch": 0.8364700961940611,
+      "grad_norm": 0.6894899606704712,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8448347971560016,
+      "grad_norm": 0.5663559436798096,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8531994981179423,
+      "grad_norm": 0.5555400252342224,
+      "learning_rate": 0.0002,
+      "loss": 1.0337,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8615641990798829,
+      "grad_norm": 0.4418621063232422,
+      "learning_rate": 0.0002,
+      "loss": 1.1086,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8699289000418235,
+      "grad_norm": 0.7832980751991272,
+      "learning_rate": 0.0002,
+      "loss": 1.1291,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8782936010037641,
+      "grad_norm": 0.6883782744407654,
+      "learning_rate": 0.0002,
+      "loss": 1.1538,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8866583019657047,
+      "grad_norm": 0.5617508888244629,
+      "learning_rate": 0.0002,
+      "loss": 1.0311,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8950230029276454,
+      "grad_norm": 0.723233699798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1869,
+      "step": 1070
+    },
+    {
+      "epoch": 0.903387703889586,
+      "grad_norm": 2.8922297954559326,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9117524048515265,
+      "grad_norm": 1.5861668586730957,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9201171058134672,
+      "grad_norm": 0.6625565886497498,
+      "learning_rate": 0.0002,
+      "loss": 1.0758,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9284818067754078,
+      "grad_norm": 0.6424002647399902,
+      "learning_rate": 0.0002,
+      "loss": 1.2524,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9368465077373483,
+      "grad_norm": 0.7253570556640625,
+      "learning_rate": 0.0002,
+      "loss": 1.0261,
+      "step": 1120
+    },
+    {
+      "epoch": 0.945211208699289,
+      "grad_norm": 0.6529237627983093,
+      "learning_rate": 0.0002,
+      "loss": 1.2131,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9535759096612296,
+      "grad_norm": 0.7082931399345398,
+      "learning_rate": 0.0002,
+      "loss": 1.0705,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9619406106231703,
+      "grad_norm": 1.10663902759552,
+      "learning_rate": 0.0002,
+      "loss": 1.2197,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9703053115851108,
+      "grad_norm": 0.6979895830154419,
+      "learning_rate": 0.0002,
+      "loss": 1.1051,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9786700125470514,
+      "grad_norm": 0.896873950958252,
+      "learning_rate": 0.0002,
+      "loss": 1.1516,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9870347135089921,
+      "grad_norm": 0.5664224624633789,
+      "learning_rate": 0.0002,
+      "loss": 1.0224,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9953994144709327,
+      "grad_norm": 0.6827336549758911,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 1190
+    },
+    {
+      "epoch": 0.999581764951903,
+      "eval_loss": 1.1974399089813232,
+      "eval_runtime": 83.0008,
+      "eval_samples_per_second": 5.494,
+      "eval_steps_per_second": 0.687,
+      "step": 1195
+    },
+    {
+      "epoch": 1.0037641154328734,
+      "grad_norm": 0.5443172454833984,
+      "learning_rate": 0.0002,
+      "loss": 1.0743,
+      "step": 1200
+    },
+    {
+      "epoch": 1.012128816394814,
+      "grad_norm": 0.71578449010849,
+      "learning_rate": 0.0002,
+      "loss": 0.9491,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0204935173567544,
+      "grad_norm": 0.681245744228363,
+      "learning_rate": 0.0002,
+      "loss": 0.8987,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0288582183186952,
+      "grad_norm": 0.5959660410881042,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0372229192806357,
+      "grad_norm": 0.581801176071167,
+      "learning_rate": 0.0002,
+      "loss": 0.9537,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0455876202425762,
+      "grad_norm": 0.6427032947540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9291,
+      "step": 1250
+    },
+    {
+      "epoch": 1.053952321204517,
+      "grad_norm": 1.2949297428131104,
+      "learning_rate": 0.0002,
+      "loss": 1.0572,
+      "step": 1260
+    },
+    {
+      "epoch": 1.0623170221664575,
+      "grad_norm": 0.7161147594451904,
+      "learning_rate": 0.0002,
+      "loss": 0.876,
+      "step": 1270
+    },
+    {
+      "epoch": 1.070681723128398,
+      "grad_norm": 0.8515461087226868,
+      "learning_rate": 0.0002,
+      "loss": 0.9624,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0790464240903388,
+      "grad_norm": 0.9086605906486511,
+      "learning_rate": 0.0002,
+      "loss": 1.0332,
+      "step": 1290
+    },
+    {
+      "epoch": 1.0874111250522793,
+      "grad_norm": 0.525374174118042,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 1300
+    },
+    {
+      "epoch": 1.09577582601422,
+      "grad_norm": 0.6631740927696228,
+      "learning_rate": 0.0002,
+      "loss": 0.987,
+      "step": 1310
+    },
+    {
+      "epoch": 1.1041405269761606,
+      "grad_norm": 0.8387110233306885,
+      "learning_rate": 0.0002,
+      "loss": 1.0077,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1125052279381011,
+      "grad_norm": 0.8402808308601379,
+      "learning_rate": 0.0002,
+      "loss": 1.0299,
+      "step": 1330
+    },
+    {
+      "epoch": 1.120869928900042,
+      "grad_norm": 0.6945340037345886,
+      "learning_rate": 0.0002,
+      "loss": 0.9625,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1292346298619824,
+      "grad_norm": 0.6942460536956787,
+      "learning_rate": 0.0002,
+      "loss": 0.9181,
+      "step": 1350
+    },
+    {
+      "epoch": 1.137599330823923,
+      "grad_norm": 0.7074856758117676,
+      "learning_rate": 0.0002,
+      "loss": 1.0279,
+      "step": 1360
+    },
+    {
+      "epoch": 1.1459640317858637,
+      "grad_norm": 0.6957907676696777,
+      "learning_rate": 0.0002,
+      "loss": 0.9177,
+      "step": 1370
+    },
+    {
+      "epoch": 1.1543287327478042,
+      "grad_norm": 0.7241228818893433,
+      "learning_rate": 0.0002,
+      "loss": 1.0561,
+      "step": 1380
+    },
+    {
+      "epoch": 1.162693433709745,
+      "grad_norm": 1.2119261026382446,
+      "learning_rate": 0.0002,
+      "loss": 0.974,
+      "step": 1390
+    },
+    {
+      "epoch": 1.1710581346716855,
+      "grad_norm": 0.7284879684448242,
+      "learning_rate": 0.0002,
+      "loss": 0.9813,
+      "step": 1400
+    },
+    {
+      "epoch": 1.179422835633626,
+      "grad_norm": 0.702438473701477,
+      "learning_rate": 0.0002,
+      "loss": 0.9153,
+      "step": 1410
+    },
+    {
+      "epoch": 1.1877875365955668,
+      "grad_norm": 0.9390414357185364,
+      "learning_rate": 0.0002,
+      "loss": 1.0409,
+      "step": 1420
+    },
+    {
+      "epoch": 1.1961522375575073,
+      "grad_norm": 0.8179782629013062,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2045169385194479,
+      "grad_norm": 1.4885749816894531,
+      "learning_rate": 0.0002,
+      "loss": 1.0606,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2128816394813886,
+      "grad_norm": 0.868131697177887,
+      "learning_rate": 0.0002,
+      "loss": 0.9497,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2212463404433291,
+      "grad_norm": 0.8125514388084412,
+      "learning_rate": 0.0002,
+      "loss": 0.9398,
+      "step": 1460
+    },
+    {
+      "epoch": 1.2296110414052697,
+      "grad_norm": 0.633736789226532,
+      "learning_rate": 0.0002,
+      "loss": 0.8868,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2379757423672104,
+      "grad_norm": 0.6061311364173889,
+      "learning_rate": 0.0002,
+      "loss": 0.9484,
+      "step": 1480
+    },
+    {
+      "epoch": 1.246340443329151,
+      "grad_norm": 0.6683570742607117,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2547051442910915,
+      "grad_norm": 0.6832399964332581,
+      "learning_rate": 0.0002,
+      "loss": 0.9645,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2630698452530322,
+      "grad_norm": 0.7690117955207825,
+      "learning_rate": 0.0002,
+      "loss": 0.9892,
+      "step": 1510
+    },
+    {
+      "epoch": 1.2714345462149728,
+      "grad_norm": 0.7987741231918335,
+      "learning_rate": 0.0002,
+      "loss": 1.0383,
+      "step": 1520
+    },
+    {
+      "epoch": 1.2797992471769133,
+      "grad_norm": 0.527604877948761,
+      "learning_rate": 0.0002,
+      "loss": 0.9531,
+      "step": 1530
+    },
+    {
+      "epoch": 1.288163948138854,
+      "grad_norm": 0.6243641376495361,
+      "learning_rate": 0.0002,
+      "loss": 0.9239,
+      "step": 1540
+    },
+    {
+      "epoch": 1.2965286491007946,
+      "grad_norm": 0.7621095776557922,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 1550
+    },
+    {
+      "epoch": 1.3048933500627353,
+      "grad_norm": 0.7913159728050232,
+      "learning_rate": 0.0002,
+      "loss": 1.0546,
+      "step": 1560
+    },
+    {
+      "epoch": 1.3132580510246759,
+      "grad_norm": 0.9507867693901062,
+      "learning_rate": 0.0002,
+      "loss": 0.9793,
+      "step": 1570
+    },
+    {
+      "epoch": 1.3216227519866166,
+      "grad_norm": 0.7301706075668335,
+      "learning_rate": 0.0002,
+      "loss": 0.979,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3299874529485571,
+      "grad_norm": 0.7653141021728516,
+      "learning_rate": 0.0002,
+      "loss": 1.0031,
+      "step": 1590
+    },
+    {
+      "epoch": 1.3383521539104977,
+      "grad_norm": 0.6372700333595276,
+      "learning_rate": 0.0002,
+      "loss": 0.8704,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3467168548724384,
+      "grad_norm": 1.7866026163101196,
+      "learning_rate": 0.0002,
+      "loss": 1.0373,
+      "step": 1610
+    },
+    {
+      "epoch": 1.355081555834379,
+      "grad_norm": 0.6353244781494141,
+      "learning_rate": 0.0002,
+      "loss": 0.9118,
+      "step": 1620
+    },
+    {
+      "epoch": 1.3634462567963195,
+      "grad_norm": 0.7673062086105347,
+      "learning_rate": 0.0002,
+      "loss": 1.0048,
+      "step": 1630
+    },
+    {
+      "epoch": 1.3718109577582602,
+      "grad_norm": 1.1364117860794067,
+      "learning_rate": 0.0002,
+      "loss": 0.9797,
+      "step": 1640
+    },
+    {
+      "epoch": 1.3801756587202008,
+      "grad_norm": 1.0685369968414307,
+      "learning_rate": 0.0002,
+      "loss": 0.8953,
+      "step": 1650
+    },
+    {
+      "epoch": 1.3885403596821413,
+      "grad_norm": 1.1614553928375244,
+      "learning_rate": 0.0002,
+      "loss": 0.9533,
+      "step": 1660
+    },
+    {
+      "epoch": 1.396905060644082,
+      "grad_norm": 1.2501142024993896,
+      "learning_rate": 0.0002,
+      "loss": 1.0274,
+      "step": 1670
+    },
+    {
+      "epoch": 1.4052697616060226,
+      "grad_norm": 1.0739696025848389,
+      "learning_rate": 0.0002,
+      "loss": 0.9498,
+      "step": 1680
+    },
+    {
+      "epoch": 1.413634462567963,
+      "grad_norm": 0.800770103931427,
+      "learning_rate": 0.0002,
+      "loss": 1.0329,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4219991635299039,
+      "grad_norm": 0.6980189085006714,
+      "learning_rate": 0.0002,
+      "loss": 1.1194,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4303638644918444,
+      "grad_norm": 0.9088300466537476,
+      "learning_rate": 0.0002,
+      "loss": 0.9536,
+      "step": 1710
+    },
+    {
+      "epoch": 1.438728565453785,
+      "grad_norm": 1.0146790742874146,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4470932664157257,
+      "grad_norm": 2.0795905590057373,
+      "learning_rate": 0.0002,
+      "loss": 1.1158,
+      "step": 1730
+    },
+    {
+      "epoch": 1.4554579673776662,
+      "grad_norm": 0.7743622064590454,
+      "learning_rate": 0.0002,
+      "loss": 0.9421,
+      "step": 1740
+    },
+    {
+      "epoch": 1.4638226683396067,
+      "grad_norm": 0.9682395458221436,
+      "learning_rate": 0.0002,
+      "loss": 1.0351,
+      "step": 1750
+    },
+    {
+      "epoch": 1.4721873693015475,
+      "grad_norm": 0.905489981174469,
+      "learning_rate": 0.0002,
+      "loss": 1.0185,
+      "step": 1760
+    },
+    {
+      "epoch": 1.480552070263488,
+      "grad_norm": 1.1918401718139648,
+      "learning_rate": 0.0002,
+      "loss": 1.0104,
+      "step": 1770
+    },
+    {
+      "epoch": 1.4889167712254288,
+      "grad_norm": 0.5931059122085571,
+      "learning_rate": 0.0002,
+      "loss": 0.9078,
+      "step": 1780
+    },
+    {
+      "epoch": 1.4972814721873693,
+      "grad_norm": 1.197264552116394,
+      "learning_rate": 0.0002,
+      "loss": 0.9916,
+      "step": 1790
+    },
+    {
+      "epoch": 1.50564617314931,
+      "grad_norm": 1.4029070138931274,
+      "learning_rate": 0.0002,
+      "loss": 0.9754,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5140108741112506,
+      "grad_norm": 0.8593041896820068,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 1810
+    },
+    {
+      "epoch": 1.522375575073191,
+      "grad_norm": 0.750442624092102,
+      "learning_rate": 0.0002,
+      "loss": 1.0252,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5307402760351319,
+      "grad_norm": 0.7551209330558777,
+      "learning_rate": 0.0002,
+      "loss": 0.9184,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5391049769970724,
+      "grad_norm": 0.7432758808135986,
+      "learning_rate": 0.0002,
+      "loss": 0.9508,
+      "step": 1840
+    },
+    {
+      "epoch": 1.547469677959013,
+      "grad_norm": 1.0624628067016602,
+      "learning_rate": 0.0002,
+      "loss": 1.0975,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5558343789209537,
+      "grad_norm": 0.8789014220237732,
+      "learning_rate": 0.0002,
+      "loss": 0.9704,
+      "step": 1860
+    },
+    {
+      "epoch": 1.5641990798828942,
+      "grad_norm": 0.7802485823631287,
+      "learning_rate": 0.0002,
+      "loss": 1.022,
+      "step": 1870
+    },
+    {
+      "epoch": 1.5725637808448347,
+      "grad_norm": 1.129615306854248,
+      "learning_rate": 0.0002,
+      "loss": 0.922,
+      "step": 1880
+    },
+    {
+      "epoch": 1.5809284818067755,
+      "grad_norm": 1.0759961605072021,
+      "learning_rate": 0.0002,
+      "loss": 0.9252,
+      "step": 1890
+    },
+    {
+      "epoch": 1.589293182768716,
+      "grad_norm": 1.0037081241607666,
+      "learning_rate": 0.0002,
+      "loss": 0.9473,
+      "step": 1900
+    },
+    {
+      "epoch": 1.5976578837306565,
+      "grad_norm": 0.6003720164299011,
+      "learning_rate": 0.0002,
+      "loss": 0.9598,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6060225846925973,
+      "grad_norm": 0.7846575975418091,
+      "learning_rate": 0.0002,
+      "loss": 0.9492,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6143872856545378,
+      "grad_norm": 0.9737453460693359,
+      "learning_rate": 0.0002,
+      "loss": 1.0247,
+      "step": 1930
+    },
+    {
+      "epoch": 1.6227519866164783,
+      "grad_norm": 0.9219926595687866,
+      "learning_rate": 0.0002,
+      "loss": 0.9906,
+      "step": 1940
+    },
+    {
+      "epoch": 1.631116687578419,
+      "grad_norm": 0.7196545004844666,
+      "learning_rate": 0.0002,
+      "loss": 0.9447,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6394813885403596,
+      "grad_norm": 0.9171157479286194,
+      "learning_rate": 0.0002,
+      "loss": 1.0166,
+      "step": 1960
+    },
+    {
+      "epoch": 1.6478460895023002,
+      "grad_norm": 0.9991112351417542,
+      "learning_rate": 0.0002,
+      "loss": 0.9248,
+      "step": 1970
+    },
+    {
+      "epoch": 1.656210790464241,
+      "grad_norm": 1.3650590181350708,
+      "learning_rate": 0.0002,
+      "loss": 0.9775,
+      "step": 1980
+    },
+    {
+      "epoch": 1.6645754914261817,
+      "grad_norm": 0.9693202376365662,
+      "learning_rate": 0.0002,
+      "loss": 0.9501,
+      "step": 1990
+    },
+    {
+      "epoch": 1.672940192388122,
+      "grad_norm": 0.9004108309745789,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 2000
+    },
+    {
+      "epoch": 1.6813048933500627,
+      "grad_norm": 1.3959358930587769,
+      "learning_rate": 0.0002,
+      "loss": 1.0098,
+      "step": 2010
+    },
+    {
+      "epoch": 1.6896695943120035,
+      "grad_norm": 1.6159738302230835,
+      "learning_rate": 0.0002,
+      "loss": 1.0145,
+      "step": 2020
+    },
+    {
+      "epoch": 1.698034295273944,
+      "grad_norm": 1.1095340251922607,
+      "learning_rate": 0.0002,
+      "loss": 0.9109,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7063989962358845,
+      "grad_norm": 0.9950175881385803,
+      "learning_rate": 0.0002,
+      "loss": 0.9436,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7147636971978253,
+      "grad_norm": 0.8590125441551208,
+      "learning_rate": 0.0002,
+      "loss": 1.0235,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7231283981597658,
+      "grad_norm": 0.7302223443984985,
+      "learning_rate": 0.0002,
+      "loss": 0.9384,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7314930991217063,
+      "grad_norm": 1.0173848867416382,
+      "learning_rate": 0.0002,
+      "loss": 1.0449,
+      "step": 2070
+    },
+    {
+      "epoch": 1.739857800083647,
+      "grad_norm": 0.6308056712150574,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 2080
+    },
+    {
+      "epoch": 1.7482225010455876,
+      "grad_norm": 1.2122596502304077,
+      "learning_rate": 0.0002,
+      "loss": 0.9637,
+      "step": 2090
+    },
+    {
+      "epoch": 1.7565872020075282,
+      "grad_norm": 1.2666280269622803,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 2100
+    },
+    {
+      "epoch": 1.764951902969469,
+      "grad_norm": 1.310709834098816,
+      "learning_rate": 0.0002,
+      "loss": 0.9263,
+      "step": 2110
+    },
+    {
+      "epoch": 1.7733166039314094,
+      "grad_norm": 0.8790634870529175,
+      "learning_rate": 0.0002,
+      "loss": 0.9711,
+      "step": 2120
+    },
+    {
+      "epoch": 1.78168130489335,
+      "grad_norm": 0.8222663998603821,
+      "learning_rate": 0.0002,
+      "loss": 0.9456,
+      "step": 2130
+    },
+    {
+      "epoch": 1.7900460058552907,
+      "grad_norm": 0.6637442708015442,
+      "learning_rate": 0.0002,
+      "loss": 0.9014,
+      "step": 2140
+    },
+    {
+      "epoch": 1.7984107068172313,
+      "grad_norm": 1.2613177299499512,
+      "learning_rate": 0.0002,
+      "loss": 1.0201,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8067754077791718,
+      "grad_norm": 0.6381147503852844,
+      "learning_rate": 0.0002,
+      "loss": 0.8389,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8151401087411125,
+      "grad_norm": 1.5663173198699951,
+      "learning_rate": 0.0002,
+      "loss": 0.9553,
+      "step": 2170
+    },
+    {
+      "epoch": 1.823504809703053,
+      "grad_norm": 0.8651582598686218,
+      "learning_rate": 0.0002,
+      "loss": 0.9369,
+      "step": 2180
+    },
+    {
+      "epoch": 1.8318695106649936,
+      "grad_norm": 0.7086225152015686,
+      "learning_rate": 0.0002,
+      "loss": 0.8555,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8402342116269343,
+      "grad_norm": 1.0986076593399048,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 2200
+    },
+    {
+      "epoch": 1.848598912588875,
+      "grad_norm": 1.0471370220184326,
+      "learning_rate": 0.0002,
+      "loss": 0.9387,
+      "step": 2210
+    },
+    {
+      "epoch": 1.8569636135508154,
+      "grad_norm": 0.8230622410774231,
+      "learning_rate": 0.0002,
+      "loss": 1.0301,
+      "step": 2220
+    },
+    {
+      "epoch": 1.8653283145127562,
+      "grad_norm": 1.093545913696289,
+      "learning_rate": 0.0002,
+      "loss": 0.9191,
+      "step": 2230
+    },
+    {
+      "epoch": 1.873693015474697,
+      "grad_norm": 0.8182677626609802,
+      "learning_rate": 0.0002,
+      "loss": 0.8441,
+      "step": 2240
+    },
+    {
+      "epoch": 1.8820577164366374,
+      "grad_norm": 0.9356469511985779,
+      "learning_rate": 0.0002,
+      "loss": 0.901,
+      "step": 2250
+    },
+    {
+      "epoch": 1.890422417398578,
+      "grad_norm": 0.8871003985404968,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 2260
+    },
+    {
+      "epoch": 1.8987871183605187,
+      "grad_norm": 1.0431411266326904,
+      "learning_rate": 0.0002,
+      "loss": 1.0625,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9071518193224593,
+      "grad_norm": 1.3339753150939941,
+      "learning_rate": 0.0002,
+      "loss": 0.9897,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9155165202843998,
+      "grad_norm": 0.9365147352218628,
+      "learning_rate": 0.0002,
+      "loss": 0.9742,
+      "step": 2290
+    },
+    {
+      "epoch": 1.9238812212463405,
+      "grad_norm": 0.721367359161377,
+      "learning_rate": 0.0002,
+      "loss": 0.9071,
+      "step": 2300
+    },
+    {
+      "epoch": 1.932245922208281,
+      "grad_norm": 1.0150835514068604,
+      "learning_rate": 0.0002,
+      "loss": 0.9101,
+      "step": 2310
+    },
+    {
+      "epoch": 1.9406106231702216,
+      "grad_norm": 0.7709364891052246,
+      "learning_rate": 0.0002,
+      "loss": 0.9583,
+      "step": 2320
+    },
+    {
+      "epoch": 1.9489753241321623,
+      "grad_norm": 1.035475254058838,
+      "learning_rate": 0.0002,
+      "loss": 0.9725,
+      "step": 2330
+    },
+    {
+      "epoch": 1.9573400250941029,
+      "grad_norm": 1.641360878944397,
+      "learning_rate": 0.0002,
+      "loss": 1.0028,
+      "step": 2340
+    },
+    {
+      "epoch": 1.9657047260560434,
+      "grad_norm": 1.6609785556793213,
+      "learning_rate": 0.0002,
+      "loss": 0.9478,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9740694270179842,
+      "grad_norm": 1.0421160459518433,
+      "learning_rate": 0.0002,
+      "loss": 0.9369,
+      "step": 2360
+    },
+    {
+      "epoch": 1.9824341279799247,
+      "grad_norm": 0.5951679944992065,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 2370
+    },
+    {
+      "epoch": 1.9907988289418652,
+      "grad_norm": 1.2476773262023926,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 2380
+    },
+    {
+      "epoch": 1.999163529903806,
+      "grad_norm": 1.0104742050170898,
+      "learning_rate": 0.0002,
+      "loss": 0.9474,
+      "step": 2390
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1764153242111206,
+      "eval_runtime": 82.769,
+      "eval_samples_per_second": 5.509,
+      "eval_steps_per_second": 0.689,
+      "step": 2391
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9560,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.049044715569152e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82cc1a869a13836981e3f5a21d92f839005da543aa938bca6e96fe51edb97f77
+size 5624

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:583e4c6954fd20db8192d2ebbf943d1a59a30150fd33a83cd4125aef2d2aa295
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d063a6eee1535cfc3e94af034408d55f30c4cf0fbcd081ec38f2b7420b8a21e6
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04b272462b737b16ee3c083b0320d130d666d4ea2cb935ad88d29662b604c473
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f883512f89181f53b1aeae38cee90cb26feae098ddbbe2bd8c5aa086657ed9ed
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2563 @@

+{
+  "best_metric": 1.1764153242111206,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391",
+  "epoch": 2.9995817649519028,
+  "eval_steps": 10,
+  "global_step": 3586,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008364700961940611,
+      "grad_norm": 1.2661447525024414,
+      "learning_rate": 0.0002,
+      "loss": 1.9268,
+      "step": 10
+    },
+    {
+      "epoch": 0.016729401923881223,
+      "grad_norm": 1.3240571022033691,
+      "learning_rate": 0.0002,
+      "loss": 1.6326,
+      "step": 20
+    },
+    {
+      "epoch": 0.025094102885821833,
+      "grad_norm": 0.7347124218940735,
+      "learning_rate": 0.0002,
+      "loss": 1.507,
+      "step": 30
+    },
+    {
+      "epoch": 0.033458803847762446,
+      "grad_norm": 0.9849194288253784,
+      "learning_rate": 0.0002,
+      "loss": 1.5363,
+      "step": 40
+    },
+    {
+      "epoch": 0.04182350480970305,
+      "grad_norm": 0.9093025326728821,
+      "learning_rate": 0.0002,
+      "loss": 1.3674,
+      "step": 50
+    },
+    {
+      "epoch": 0.050188205771643665,
+      "grad_norm": 0.737514078617096,
+      "learning_rate": 0.0002,
+      "loss": 1.2542,
+      "step": 60
+    },
+    {
+      "epoch": 0.05855290673358427,
+      "grad_norm": 1.3245333433151245,
+      "learning_rate": 0.0002,
+      "loss": 1.2172,
+      "step": 70
+    },
+    {
+      "epoch": 0.06691760769552489,
+      "grad_norm": 0.7806007862091064,
+      "learning_rate": 0.0002,
+      "loss": 1.2478,
+      "step": 80
+    },
+    {
+      "epoch": 0.07528230865746549,
+      "grad_norm": 0.6627500057220459,
+      "learning_rate": 0.0002,
+      "loss": 1.1398,
+      "step": 90
+    },
+    {
+      "epoch": 0.0836470096194061,
+      "grad_norm": 1.0278682708740234,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 100
+    },
+    {
+      "epoch": 0.09201171058134672,
+      "grad_norm": 0.7746474146842957,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 110
+    },
+    {
+      "epoch": 0.10037641154328733,
+      "grad_norm": 0.5935637950897217,
+      "learning_rate": 0.0002,
+      "loss": 1.1171,
+      "step": 120
+    },
+    {
+      "epoch": 0.10874111250522794,
+      "grad_norm": 0.6738003492355347,
+      "learning_rate": 0.0002,
+      "loss": 1.1625,
+      "step": 130
+    },
+    {
+      "epoch": 0.11710581346716854,
+      "grad_norm": 0.6447349190711975,
+      "learning_rate": 0.0002,
+      "loss": 1.3002,
+      "step": 140
+    },
+    {
+      "epoch": 0.12547051442910917,
+      "grad_norm": 0.5628684759140015,
+      "learning_rate": 0.0002,
+      "loss": 1.1294,
+      "step": 150
+    },
+    {
+      "epoch": 0.13383521539104978,
+      "grad_norm": 0.7446871399879456,
+      "learning_rate": 0.0002,
+      "loss": 1.1374,
+      "step": 160
+    },
+    {
+      "epoch": 0.14219991635299037,
+      "grad_norm": 0.5214279294013977,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 170
+    },
+    {
+      "epoch": 0.15056461731493098,
+      "grad_norm": 0.5324464440345764,
+      "learning_rate": 0.0002,
+      "loss": 1.2612,
+      "step": 180
+    },
+    {
+      "epoch": 0.1589293182768716,
+      "grad_norm": 0.5539828538894653,
+      "learning_rate": 0.0002,
+      "loss": 1.3168,
+      "step": 190
+    },
+    {
+      "epoch": 0.1672940192388122,
+      "grad_norm": 0.5192331671714783,
+      "learning_rate": 0.0002,
+      "loss": 1.0835,
+      "step": 200
+    },
+    {
+      "epoch": 0.17565872020075282,
+      "grad_norm": 0.7160783410072327,
+      "learning_rate": 0.0002,
+      "loss": 1.1799,
+      "step": 210
+    },
+    {
+      "epoch": 0.18402342116269343,
+      "grad_norm": 0.8428353071212769,
+      "learning_rate": 0.0002,
+      "loss": 1.1527,
+      "step": 220
+    },
+    {
+      "epoch": 0.19238812212463405,
+      "grad_norm": 0.493561714887619,
+      "learning_rate": 0.0002,
+      "loss": 1.1284,
+      "step": 230
+    },
+    {
+      "epoch": 0.20075282308657466,
+      "grad_norm": 2.522308111190796,
+      "learning_rate": 0.0002,
+      "loss": 1.1975,
+      "step": 240
+    },
+    {
+      "epoch": 0.20911752404851527,
+      "grad_norm": 0.7338423728942871,
+      "learning_rate": 0.0002,
+      "loss": 1.1459,
+      "step": 250
+    },
+    {
+      "epoch": 0.2174822250104559,
+      "grad_norm": 0.6501832604408264,
+      "learning_rate": 0.0002,
+      "loss": 1.1311,
+      "step": 260
+    },
+    {
+      "epoch": 0.2258469259723965,
+      "grad_norm": 0.6331472992897034,
+      "learning_rate": 0.0002,
+      "loss": 1.2241,
+      "step": 270
+    },
+    {
+      "epoch": 0.23421162693433709,
+      "grad_norm": 0.5653548836708069,
+      "learning_rate": 0.0002,
+      "loss": 1.2329,
+      "step": 280
+    },
+    {
+      "epoch": 0.2425763278962777,
+      "grad_norm": 0.5833444595336914,
+      "learning_rate": 0.0002,
+      "loss": 1.119,
+      "step": 290
+    },
+    {
+      "epoch": 0.25094102885821834,
+      "grad_norm": 0.6707335114479065,
+      "learning_rate": 0.0002,
+      "loss": 1.2157,
+      "step": 300
+    },
+    {
+      "epoch": 0.2593057298201589,
+      "grad_norm": 0.5435659885406494,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 310
+    },
+    {
+      "epoch": 0.26767043078209957,
+      "grad_norm": 0.5752334594726562,
+      "learning_rate": 0.0002,
+      "loss": 1.0781,
+      "step": 320
+    },
+    {
+      "epoch": 0.27603513174404015,
+      "grad_norm": 0.5790163278579712,
+      "learning_rate": 0.0002,
+      "loss": 1.0493,
+      "step": 330
+    },
+    {
+      "epoch": 0.28439983270598074,
+      "grad_norm": 0.46593040227890015,
+      "learning_rate": 0.0002,
+      "loss": 1.2281,
+      "step": 340
+    },
+    {
+      "epoch": 0.2927645336679214,
+      "grad_norm": 0.7713788151741028,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 350
+    },
+    {
+      "epoch": 0.30112923462986196,
+      "grad_norm": 0.7719253301620483,
+      "learning_rate": 0.0002,
+      "loss": 1.1672,
+      "step": 360
+    },
+    {
+      "epoch": 0.3094939355918026,
+      "grad_norm": 0.7065562605857849,
+      "learning_rate": 0.0002,
+      "loss": 1.0884,
+      "step": 370
+    },
+    {
+      "epoch": 0.3178586365537432,
+      "grad_norm": 0.7082679271697998,
+      "learning_rate": 0.0002,
+      "loss": 1.0902,
+      "step": 380
+    },
+    {
+      "epoch": 0.32622333751568383,
+      "grad_norm": 0.5779536366462708,
+      "learning_rate": 0.0002,
+      "loss": 1.1696,
+      "step": 390
+    },
+    {
+      "epoch": 0.3345880384776244,
+      "grad_norm": 0.6321173310279846,
+      "learning_rate": 0.0002,
+      "loss": 1.1916,
+      "step": 400
+    },
+    {
+      "epoch": 0.34295273943956506,
+      "grad_norm": 0.7237968444824219,
+      "learning_rate": 0.0002,
+      "loss": 1.1419,
+      "step": 410
+    },
+    {
+      "epoch": 0.35131744040150564,
+      "grad_norm": 0.6730817556381226,
+      "learning_rate": 0.0002,
+      "loss": 0.9877,
+      "step": 420
+    },
+    {
+      "epoch": 0.3596821413634463,
+      "grad_norm": 0.6245285868644714,
+      "learning_rate": 0.0002,
+      "loss": 1.108,
+      "step": 430
+    },
+    {
+      "epoch": 0.36804684232538687,
+      "grad_norm": 0.9926134347915649,
+      "learning_rate": 0.0002,
+      "loss": 1.209,
+      "step": 440
+    },
+    {
+      "epoch": 0.37641154328732745,
+      "grad_norm": 0.5567468404769897,
+      "learning_rate": 0.0002,
+      "loss": 1.0664,
+      "step": 450
+    },
+    {
+      "epoch": 0.3847762442492681,
+      "grad_norm": 0.5764540433883667,
+      "learning_rate": 0.0002,
+      "loss": 1.1838,
+      "step": 460
+    },
+    {
+      "epoch": 0.3931409452112087,
+      "grad_norm": 1.1908321380615234,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 470
+    },
+    {
+      "epoch": 0.4015056461731493,
+      "grad_norm": 0.6756157875061035,
+      "learning_rate": 0.0002,
+      "loss": 1.1601,
+      "step": 480
+    },
+    {
+      "epoch": 0.4098703471350899,
+      "grad_norm": 0.5793355107307434,
+      "learning_rate": 0.0002,
+      "loss": 1.1703,
+      "step": 490
+    },
+    {
+      "epoch": 0.41823504809703055,
+      "grad_norm": 0.6145297288894653,
+      "learning_rate": 0.0002,
+      "loss": 1.1289,
+      "step": 500
+    },
+    {
+      "epoch": 0.42659974905897113,
+      "grad_norm": 0.48073795437812805,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 510
+    },
+    {
+      "epoch": 0.4349644500209118,
+      "grad_norm": 0.802431046962738,
+      "learning_rate": 0.0002,
+      "loss": 1.1335,
+      "step": 520
+    },
+    {
+      "epoch": 0.44332915098285236,
+      "grad_norm": 0.5906000137329102,
+      "learning_rate": 0.0002,
+      "loss": 1.0574,
+      "step": 530
+    },
+    {
+      "epoch": 0.451693851944793,
+      "grad_norm": 0.5615521669387817,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 540
+    },
+    {
+      "epoch": 0.4600585529067336,
+      "grad_norm": 0.5688650012016296,
+      "learning_rate": 0.0002,
+      "loss": 1.2228,
+      "step": 550
+    },
+    {
+      "epoch": 0.46842325386867417,
+      "grad_norm": 0.7505079507827759,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 560
+    },
+    {
+      "epoch": 0.4767879548306148,
+      "grad_norm": 0.6905680298805237,
+      "learning_rate": 0.0002,
+      "loss": 1.1566,
+      "step": 570
+    },
+    {
+      "epoch": 0.4851526557925554,
+      "grad_norm": 0.5885183811187744,
+      "learning_rate": 0.0002,
+      "loss": 1.1256,
+      "step": 580
+    },
+    {
+      "epoch": 0.49351735675449604,
+      "grad_norm": 0.7367458343505859,
+      "learning_rate": 0.0002,
+      "loss": 1.211,
+      "step": 590
+    },
+    {
+      "epoch": 0.5018820577164367,
+      "grad_norm": 0.9157859086990356,
+      "learning_rate": 0.0002,
+      "loss": 1.1215,
+      "step": 600
+    },
+    {
+      "epoch": 0.5102467586783772,
+      "grad_norm": 0.49971529841423035,
+      "learning_rate": 0.0002,
+      "loss": 1.3101,
+      "step": 610
+    },
+    {
+      "epoch": 0.5186114596403179,
+      "grad_norm": 0.5031328797340393,
+      "learning_rate": 0.0002,
+      "loss": 1.1223,
+      "step": 620
+    },
+    {
+      "epoch": 0.5269761606022585,
+      "grad_norm": 0.6945798397064209,
+      "learning_rate": 0.0002,
+      "loss": 1.154,
+      "step": 630
+    },
+    {
+      "epoch": 0.5353408615641991,
+      "grad_norm": 0.7563218474388123,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 640
+    },
+    {
+      "epoch": 0.5437055625261397,
+      "grad_norm": 0.9215132594108582,
+      "learning_rate": 0.0002,
+      "loss": 1.2364,
+      "step": 650
+    },
+    {
+      "epoch": 0.5520702634880803,
+      "grad_norm": 1.0132478475570679,
+      "learning_rate": 0.0002,
+      "loss": 1.2179,
+      "step": 660
+    },
+    {
+      "epoch": 0.560434964450021,
+      "grad_norm": 1.448024868965149,
+      "learning_rate": 0.0002,
+      "loss": 1.1016,
+      "step": 670
+    },
+    {
+      "epoch": 0.5687996654119615,
+      "grad_norm": 0.7022866010665894,
+      "learning_rate": 0.0002,
+      "loss": 1.1918,
+      "step": 680
+    },
+    {
+      "epoch": 0.5771643663739021,
+      "grad_norm": 0.7366224527359009,
+      "learning_rate": 0.0002,
+      "loss": 1.1108,
+      "step": 690
+    },
+    {
+      "epoch": 0.5855290673358428,
+      "grad_norm": 0.722874641418457,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 700
+    },
+    {
+      "epoch": 0.5938937682977834,
+      "grad_norm": 1.0756473541259766,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 710
+    },
+    {
+      "epoch": 0.6022584692597239,
+      "grad_norm": 0.607101559638977,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 720
+    },
+    {
+      "epoch": 0.6106231702216646,
+      "grad_norm": 0.7424359917640686,
+      "learning_rate": 0.0002,
+      "loss": 1.1561,
+      "step": 730
+    },
+    {
+      "epoch": 0.6189878711836052,
+      "grad_norm": 0.7123169898986816,
+      "learning_rate": 0.0002,
+      "loss": 1.1124,
+      "step": 740
+    },
+    {
+      "epoch": 0.6273525721455459,
+      "grad_norm": 0.672195315361023,
+      "learning_rate": 0.0002,
+      "loss": 1.1209,
+      "step": 750
+    },
+    {
+      "epoch": 0.6357172731074864,
+      "grad_norm": 0.8329780697822571,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 760
+    },
+    {
+      "epoch": 0.644081974069427,
+      "grad_norm": 0.7011522650718689,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 770
+    },
+    {
+      "epoch": 0.6524466750313677,
+      "grad_norm": 0.6425889730453491,
+      "learning_rate": 0.0002,
+      "loss": 1.2505,
+      "step": 780
+    },
+    {
+      "epoch": 0.6608113759933082,
+      "grad_norm": 0.8729137182235718,
+      "learning_rate": 0.0002,
+      "loss": 1.2005,
+      "step": 790
+    },
+    {
+      "epoch": 0.6691760769552488,
+      "grad_norm": 0.5885024070739746,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 800
+    },
+    {
+      "epoch": 0.6775407779171895,
+      "grad_norm": 0.526979386806488,
+      "learning_rate": 0.0002,
+      "loss": 1.1901,
+      "step": 810
+    },
+    {
+      "epoch": 0.6859054788791301,
+      "grad_norm": 0.998365044593811,
+      "learning_rate": 0.0002,
+      "loss": 1.1757,
+      "step": 820
+    },
+    {
+      "epoch": 0.6942701798410706,
+      "grad_norm": 0.6049501299858093,
+      "learning_rate": 0.0002,
+      "loss": 1.0278,
+      "step": 830
+    },
+    {
+      "epoch": 0.7026348808030113,
+      "grad_norm": 0.7015583515167236,
+      "learning_rate": 0.0002,
+      "loss": 1.1102,
+      "step": 840
+    },
+    {
+      "epoch": 0.7109995817649519,
+      "grad_norm": 0.5852547883987427,
+      "learning_rate": 0.0002,
+      "loss": 1.1041,
+      "step": 850
+    },
+    {
+      "epoch": 0.7193642827268926,
+      "grad_norm": 0.6017204523086548,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 860
+    },
+    {
+      "epoch": 0.7277289836888331,
+      "grad_norm": 0.7195692658424377,
+      "learning_rate": 0.0002,
+      "loss": 1.0611,
+      "step": 870
+    },
+    {
+      "epoch": 0.7360936846507737,
+      "grad_norm": 0.8087519407272339,
+      "learning_rate": 0.0002,
+      "loss": 1.1497,
+      "step": 880
+    },
+    {
+      "epoch": 0.7444583856127144,
+      "grad_norm": 0.988362193107605,
+      "learning_rate": 0.0002,
+      "loss": 1.1087,
+      "step": 890
+    },
+    {
+      "epoch": 0.7528230865746549,
+      "grad_norm": 0.6142330765724182,
+      "learning_rate": 0.0002,
+      "loss": 1.049,
+      "step": 900
+    },
+    {
+      "epoch": 0.7611877875365956,
+      "grad_norm": 0.6751818656921387,
+      "learning_rate": 0.0002,
+      "loss": 1.0388,
+      "step": 910
+    },
+    {
+      "epoch": 0.7695524884985362,
+      "grad_norm": 0.7528653740882874,
+      "learning_rate": 0.0002,
+      "loss": 1.2125,
+      "step": 920
+    },
+    {
+      "epoch": 0.7779171894604768,
+      "grad_norm": 0.613039493560791,
+      "learning_rate": 0.0002,
+      "loss": 0.9926,
+      "step": 930
+    },
+    {
+      "epoch": 0.7862818904224174,
+      "grad_norm": 0.8040242791175842,
+      "learning_rate": 0.0002,
+      "loss": 1.2582,
+      "step": 940
+    },
+    {
+      "epoch": 0.794646591384358,
+      "grad_norm": 0.5306838154792786,
+      "learning_rate": 0.0002,
+      "loss": 1.1397,
+      "step": 950
+    },
+    {
+      "epoch": 0.8030112923462986,
+      "grad_norm": 0.7037438750267029,
+      "learning_rate": 0.0002,
+      "loss": 1.0303,
+      "step": 960
+    },
+    {
+      "epoch": 0.8113759933082393,
+      "grad_norm": 0.6726985573768616,
+      "learning_rate": 0.0002,
+      "loss": 1.1531,
+      "step": 970
+    },
+    {
+      "epoch": 0.8197406942701798,
+      "grad_norm": 0.9324426651000977,
+      "learning_rate": 0.0002,
+      "loss": 1.125,
+      "step": 980
+    },
+    {
+      "epoch": 0.8281053952321205,
+      "grad_norm": 0.5811492204666138,
+      "learning_rate": 0.0002,
+      "loss": 1.0744,
+      "step": 990
+    },
+    {
+      "epoch": 0.8364700961940611,
+      "grad_norm": 0.6894899606704712,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8448347971560016,
+      "grad_norm": 0.5663559436798096,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8531994981179423,
+      "grad_norm": 0.5555400252342224,
+      "learning_rate": 0.0002,
+      "loss": 1.0337,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8615641990798829,
+      "grad_norm": 0.4418621063232422,
+      "learning_rate": 0.0002,
+      "loss": 1.1086,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8699289000418235,
+      "grad_norm": 0.7832980751991272,
+      "learning_rate": 0.0002,
+      "loss": 1.1291,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8782936010037641,
+      "grad_norm": 0.6883782744407654,
+      "learning_rate": 0.0002,
+      "loss": 1.1538,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8866583019657047,
+      "grad_norm": 0.5617508888244629,
+      "learning_rate": 0.0002,
+      "loss": 1.0311,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8950230029276454,
+      "grad_norm": 0.723233699798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1869,
+      "step": 1070
+    },
+    {
+      "epoch": 0.903387703889586,
+      "grad_norm": 2.8922297954559326,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9117524048515265,
+      "grad_norm": 1.5861668586730957,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9201171058134672,
+      "grad_norm": 0.6625565886497498,
+      "learning_rate": 0.0002,
+      "loss": 1.0758,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9284818067754078,
+      "grad_norm": 0.6424002647399902,
+      "learning_rate": 0.0002,
+      "loss": 1.2524,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9368465077373483,
+      "grad_norm": 0.7253570556640625,
+      "learning_rate": 0.0002,
+      "loss": 1.0261,
+      "step": 1120
+    },
+    {
+      "epoch": 0.945211208699289,
+      "grad_norm": 0.6529237627983093,
+      "learning_rate": 0.0002,
+      "loss": 1.2131,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9535759096612296,
+      "grad_norm": 0.7082931399345398,
+      "learning_rate": 0.0002,
+      "loss": 1.0705,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9619406106231703,
+      "grad_norm": 1.10663902759552,
+      "learning_rate": 0.0002,
+      "loss": 1.2197,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9703053115851108,
+      "grad_norm": 0.6979895830154419,
+      "learning_rate": 0.0002,
+      "loss": 1.1051,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9786700125470514,
+      "grad_norm": 0.896873950958252,
+      "learning_rate": 0.0002,
+      "loss": 1.1516,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9870347135089921,
+      "grad_norm": 0.5664224624633789,
+      "learning_rate": 0.0002,
+      "loss": 1.0224,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9953994144709327,
+      "grad_norm": 0.6827336549758911,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 1190
+    },
+    {
+      "epoch": 0.999581764951903,
+      "eval_loss": 1.1974399089813232,
+      "eval_runtime": 83.0008,
+      "eval_samples_per_second": 5.494,
+      "eval_steps_per_second": 0.687,
+      "step": 1195
+    },
+    {
+      "epoch": 1.0037641154328734,
+      "grad_norm": 0.5443172454833984,
+      "learning_rate": 0.0002,
+      "loss": 1.0743,
+      "step": 1200
+    },
+    {
+      "epoch": 1.012128816394814,
+      "grad_norm": 0.71578449010849,
+      "learning_rate": 0.0002,
+      "loss": 0.9491,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0204935173567544,
+      "grad_norm": 0.681245744228363,
+      "learning_rate": 0.0002,
+      "loss": 0.8987,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0288582183186952,
+      "grad_norm": 0.5959660410881042,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0372229192806357,
+      "grad_norm": 0.581801176071167,
+      "learning_rate": 0.0002,
+      "loss": 0.9537,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0455876202425762,
+      "grad_norm": 0.6427032947540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9291,
+      "step": 1250
+    },
+    {
+      "epoch": 1.053952321204517,
+      "grad_norm": 1.2949297428131104,
+      "learning_rate": 0.0002,
+      "loss": 1.0572,
+      "step": 1260
+    },
+    {
+      "epoch": 1.0623170221664575,
+      "grad_norm": 0.7161147594451904,
+      "learning_rate": 0.0002,
+      "loss": 0.876,
+      "step": 1270
+    },
+    {
+      "epoch": 1.070681723128398,
+      "grad_norm": 0.8515461087226868,
+      "learning_rate": 0.0002,
+      "loss": 0.9624,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0790464240903388,
+      "grad_norm": 0.9086605906486511,
+      "learning_rate": 0.0002,
+      "loss": 1.0332,
+      "step": 1290
+    },
+    {
+      "epoch": 1.0874111250522793,
+      "grad_norm": 0.525374174118042,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 1300
+    },
+    {
+      "epoch": 1.09577582601422,
+      "grad_norm": 0.6631740927696228,
+      "learning_rate": 0.0002,
+      "loss": 0.987,
+      "step": 1310
+    },
+    {
+      "epoch": 1.1041405269761606,
+      "grad_norm": 0.8387110233306885,
+      "learning_rate": 0.0002,
+      "loss": 1.0077,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1125052279381011,
+      "grad_norm": 0.8402808308601379,
+      "learning_rate": 0.0002,
+      "loss": 1.0299,
+      "step": 1330
+    },
+    {
+      "epoch": 1.120869928900042,
+      "grad_norm": 0.6945340037345886,
+      "learning_rate": 0.0002,
+      "loss": 0.9625,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1292346298619824,
+      "grad_norm": 0.6942460536956787,
+      "learning_rate": 0.0002,
+      "loss": 0.9181,
+      "step": 1350
+    },
+    {
+      "epoch": 1.137599330823923,
+      "grad_norm": 0.7074856758117676,
+      "learning_rate": 0.0002,
+      "loss": 1.0279,
+      "step": 1360
+    },
+    {
+      "epoch": 1.1459640317858637,
+      "grad_norm": 0.6957907676696777,
+      "learning_rate": 0.0002,
+      "loss": 0.9177,
+      "step": 1370
+    },
+    {
+      "epoch": 1.1543287327478042,
+      "grad_norm": 0.7241228818893433,
+      "learning_rate": 0.0002,
+      "loss": 1.0561,
+      "step": 1380
+    },
+    {
+      "epoch": 1.162693433709745,
+      "grad_norm": 1.2119261026382446,
+      "learning_rate": 0.0002,
+      "loss": 0.974,
+      "step": 1390
+    },
+    {
+      "epoch": 1.1710581346716855,
+      "grad_norm": 0.7284879684448242,
+      "learning_rate": 0.0002,
+      "loss": 0.9813,
+      "step": 1400
+    },
+    {
+      "epoch": 1.179422835633626,
+      "grad_norm": 0.702438473701477,
+      "learning_rate": 0.0002,
+      "loss": 0.9153,
+      "step": 1410
+    },
+    {
+      "epoch": 1.1877875365955668,
+      "grad_norm": 0.9390414357185364,
+      "learning_rate": 0.0002,
+      "loss": 1.0409,
+      "step": 1420
+    },
+    {
+      "epoch": 1.1961522375575073,
+      "grad_norm": 0.8179782629013062,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2045169385194479,
+      "grad_norm": 1.4885749816894531,
+      "learning_rate": 0.0002,
+      "loss": 1.0606,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2128816394813886,
+      "grad_norm": 0.868131697177887,
+      "learning_rate": 0.0002,
+      "loss": 0.9497,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2212463404433291,
+      "grad_norm": 0.8125514388084412,
+      "learning_rate": 0.0002,
+      "loss": 0.9398,
+      "step": 1460
+    },
+    {
+      "epoch": 1.2296110414052697,
+      "grad_norm": 0.633736789226532,
+      "learning_rate": 0.0002,
+      "loss": 0.8868,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2379757423672104,
+      "grad_norm": 0.6061311364173889,
+      "learning_rate": 0.0002,
+      "loss": 0.9484,
+      "step": 1480
+    },
+    {
+      "epoch": 1.246340443329151,
+      "grad_norm": 0.6683570742607117,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2547051442910915,
+      "grad_norm": 0.6832399964332581,
+      "learning_rate": 0.0002,
+      "loss": 0.9645,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2630698452530322,
+      "grad_norm": 0.7690117955207825,
+      "learning_rate": 0.0002,
+      "loss": 0.9892,
+      "step": 1510
+    },
+    {
+      "epoch": 1.2714345462149728,
+      "grad_norm": 0.7987741231918335,
+      "learning_rate": 0.0002,
+      "loss": 1.0383,
+      "step": 1520
+    },
+    {
+      "epoch": 1.2797992471769133,
+      "grad_norm": 0.527604877948761,
+      "learning_rate": 0.0002,
+      "loss": 0.9531,
+      "step": 1530
+    },
+    {
+      "epoch": 1.288163948138854,
+      "grad_norm": 0.6243641376495361,
+      "learning_rate": 0.0002,
+      "loss": 0.9239,
+      "step": 1540
+    },
+    {
+      "epoch": 1.2965286491007946,
+      "grad_norm": 0.7621095776557922,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 1550
+    },
+    {
+      "epoch": 1.3048933500627353,
+      "grad_norm": 0.7913159728050232,
+      "learning_rate": 0.0002,
+      "loss": 1.0546,
+      "step": 1560
+    },
+    {
+      "epoch": 1.3132580510246759,
+      "grad_norm": 0.9507867693901062,
+      "learning_rate": 0.0002,
+      "loss": 0.9793,
+      "step": 1570
+    },
+    {
+      "epoch": 1.3216227519866166,
+      "grad_norm": 0.7301706075668335,
+      "learning_rate": 0.0002,
+      "loss": 0.979,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3299874529485571,
+      "grad_norm": 0.7653141021728516,
+      "learning_rate": 0.0002,
+      "loss": 1.0031,
+      "step": 1590
+    },
+    {
+      "epoch": 1.3383521539104977,
+      "grad_norm": 0.6372700333595276,
+      "learning_rate": 0.0002,
+      "loss": 0.8704,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3467168548724384,
+      "grad_norm": 1.7866026163101196,
+      "learning_rate": 0.0002,
+      "loss": 1.0373,
+      "step": 1610
+    },
+    {
+      "epoch": 1.355081555834379,
+      "grad_norm": 0.6353244781494141,
+      "learning_rate": 0.0002,
+      "loss": 0.9118,
+      "step": 1620
+    },
+    {
+      "epoch": 1.3634462567963195,
+      "grad_norm": 0.7673062086105347,
+      "learning_rate": 0.0002,
+      "loss": 1.0048,
+      "step": 1630
+    },
+    {
+      "epoch": 1.3718109577582602,
+      "grad_norm": 1.1364117860794067,
+      "learning_rate": 0.0002,
+      "loss": 0.9797,
+      "step": 1640
+    },
+    {
+      "epoch": 1.3801756587202008,
+      "grad_norm": 1.0685369968414307,
+      "learning_rate": 0.0002,
+      "loss": 0.8953,
+      "step": 1650
+    },
+    {
+      "epoch": 1.3885403596821413,
+      "grad_norm": 1.1614553928375244,
+      "learning_rate": 0.0002,
+      "loss": 0.9533,
+      "step": 1660
+    },
+    {
+      "epoch": 1.396905060644082,
+      "grad_norm": 1.2501142024993896,
+      "learning_rate": 0.0002,
+      "loss": 1.0274,
+      "step": 1670
+    },
+    {
+      "epoch": 1.4052697616060226,
+      "grad_norm": 1.0739696025848389,
+      "learning_rate": 0.0002,
+      "loss": 0.9498,
+      "step": 1680
+    },
+    {
+      "epoch": 1.413634462567963,
+      "grad_norm": 0.800770103931427,
+      "learning_rate": 0.0002,
+      "loss": 1.0329,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4219991635299039,
+      "grad_norm": 0.6980189085006714,
+      "learning_rate": 0.0002,
+      "loss": 1.1194,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4303638644918444,
+      "grad_norm": 0.9088300466537476,
+      "learning_rate": 0.0002,
+      "loss": 0.9536,
+      "step": 1710
+    },
+    {
+      "epoch": 1.438728565453785,
+      "grad_norm": 1.0146790742874146,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4470932664157257,
+      "grad_norm": 2.0795905590057373,
+      "learning_rate": 0.0002,
+      "loss": 1.1158,
+      "step": 1730
+    },
+    {
+      "epoch": 1.4554579673776662,
+      "grad_norm": 0.7743622064590454,
+      "learning_rate": 0.0002,
+      "loss": 0.9421,
+      "step": 1740
+    },
+    {
+      "epoch": 1.4638226683396067,
+      "grad_norm": 0.9682395458221436,
+      "learning_rate": 0.0002,
+      "loss": 1.0351,
+      "step": 1750
+    },
+    {
+      "epoch": 1.4721873693015475,
+      "grad_norm": 0.905489981174469,
+      "learning_rate": 0.0002,
+      "loss": 1.0185,
+      "step": 1760
+    },
+    {
+      "epoch": 1.480552070263488,
+      "grad_norm": 1.1918401718139648,
+      "learning_rate": 0.0002,
+      "loss": 1.0104,
+      "step": 1770
+    },
+    {
+      "epoch": 1.4889167712254288,
+      "grad_norm": 0.5931059122085571,
+      "learning_rate": 0.0002,
+      "loss": 0.9078,
+      "step": 1780
+    },
+    {
+      "epoch": 1.4972814721873693,
+      "grad_norm": 1.197264552116394,
+      "learning_rate": 0.0002,
+      "loss": 0.9916,
+      "step": 1790
+    },
+    {
+      "epoch": 1.50564617314931,
+      "grad_norm": 1.4029070138931274,
+      "learning_rate": 0.0002,
+      "loss": 0.9754,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5140108741112506,
+      "grad_norm": 0.8593041896820068,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 1810
+    },
+    {
+      "epoch": 1.522375575073191,
+      "grad_norm": 0.750442624092102,
+      "learning_rate": 0.0002,
+      "loss": 1.0252,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5307402760351319,
+      "grad_norm": 0.7551209330558777,
+      "learning_rate": 0.0002,
+      "loss": 0.9184,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5391049769970724,
+      "grad_norm": 0.7432758808135986,
+      "learning_rate": 0.0002,
+      "loss": 0.9508,
+      "step": 1840
+    },
+    {
+      "epoch": 1.547469677959013,
+      "grad_norm": 1.0624628067016602,
+      "learning_rate": 0.0002,
+      "loss": 1.0975,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5558343789209537,
+      "grad_norm": 0.8789014220237732,
+      "learning_rate": 0.0002,
+      "loss": 0.9704,
+      "step": 1860
+    },
+    {
+      "epoch": 1.5641990798828942,
+      "grad_norm": 0.7802485823631287,
+      "learning_rate": 0.0002,
+      "loss": 1.022,
+      "step": 1870
+    },
+    {
+      "epoch": 1.5725637808448347,
+      "grad_norm": 1.129615306854248,
+      "learning_rate": 0.0002,
+      "loss": 0.922,
+      "step": 1880
+    },
+    {
+      "epoch": 1.5809284818067755,
+      "grad_norm": 1.0759961605072021,
+      "learning_rate": 0.0002,
+      "loss": 0.9252,
+      "step": 1890
+    },
+    {
+      "epoch": 1.589293182768716,
+      "grad_norm": 1.0037081241607666,
+      "learning_rate": 0.0002,
+      "loss": 0.9473,
+      "step": 1900
+    },
+    {
+      "epoch": 1.5976578837306565,
+      "grad_norm": 0.6003720164299011,
+      "learning_rate": 0.0002,
+      "loss": 0.9598,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6060225846925973,
+      "grad_norm": 0.7846575975418091,
+      "learning_rate": 0.0002,
+      "loss": 0.9492,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6143872856545378,
+      "grad_norm": 0.9737453460693359,
+      "learning_rate": 0.0002,
+      "loss": 1.0247,
+      "step": 1930
+    },
+    {
+      "epoch": 1.6227519866164783,
+      "grad_norm": 0.9219926595687866,
+      "learning_rate": 0.0002,
+      "loss": 0.9906,
+      "step": 1940
+    },
+    {
+      "epoch": 1.631116687578419,
+      "grad_norm": 0.7196545004844666,
+      "learning_rate": 0.0002,
+      "loss": 0.9447,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6394813885403596,
+      "grad_norm": 0.9171157479286194,
+      "learning_rate": 0.0002,
+      "loss": 1.0166,
+      "step": 1960
+    },
+    {
+      "epoch": 1.6478460895023002,
+      "grad_norm": 0.9991112351417542,
+      "learning_rate": 0.0002,
+      "loss": 0.9248,
+      "step": 1970
+    },
+    {
+      "epoch": 1.656210790464241,
+      "grad_norm": 1.3650590181350708,
+      "learning_rate": 0.0002,
+      "loss": 0.9775,
+      "step": 1980
+    },
+    {
+      "epoch": 1.6645754914261817,
+      "grad_norm": 0.9693202376365662,
+      "learning_rate": 0.0002,
+      "loss": 0.9501,
+      "step": 1990
+    },
+    {
+      "epoch": 1.672940192388122,
+      "grad_norm": 0.9004108309745789,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 2000
+    },
+    {
+      "epoch": 1.6813048933500627,
+      "grad_norm": 1.3959358930587769,
+      "learning_rate": 0.0002,
+      "loss": 1.0098,
+      "step": 2010
+    },
+    {
+      "epoch": 1.6896695943120035,
+      "grad_norm": 1.6159738302230835,
+      "learning_rate": 0.0002,
+      "loss": 1.0145,
+      "step": 2020
+    },
+    {
+      "epoch": 1.698034295273944,
+      "grad_norm": 1.1095340251922607,
+      "learning_rate": 0.0002,
+      "loss": 0.9109,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7063989962358845,
+      "grad_norm": 0.9950175881385803,
+      "learning_rate": 0.0002,
+      "loss": 0.9436,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7147636971978253,
+      "grad_norm": 0.8590125441551208,
+      "learning_rate": 0.0002,
+      "loss": 1.0235,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7231283981597658,
+      "grad_norm": 0.7302223443984985,
+      "learning_rate": 0.0002,
+      "loss": 0.9384,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7314930991217063,
+      "grad_norm": 1.0173848867416382,
+      "learning_rate": 0.0002,
+      "loss": 1.0449,
+      "step": 2070
+    },
+    {
+      "epoch": 1.739857800083647,
+      "grad_norm": 0.6308056712150574,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 2080
+    },
+    {
+      "epoch": 1.7482225010455876,
+      "grad_norm": 1.2122596502304077,
+      "learning_rate": 0.0002,
+      "loss": 0.9637,
+      "step": 2090
+    },
+    {
+      "epoch": 1.7565872020075282,
+      "grad_norm": 1.2666280269622803,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 2100
+    },
+    {
+      "epoch": 1.764951902969469,
+      "grad_norm": 1.310709834098816,
+      "learning_rate": 0.0002,
+      "loss": 0.9263,
+      "step": 2110
+    },
+    {
+      "epoch": 1.7733166039314094,
+      "grad_norm": 0.8790634870529175,
+      "learning_rate": 0.0002,
+      "loss": 0.9711,
+      "step": 2120
+    },
+    {
+      "epoch": 1.78168130489335,
+      "grad_norm": 0.8222663998603821,
+      "learning_rate": 0.0002,
+      "loss": 0.9456,
+      "step": 2130
+    },
+    {
+      "epoch": 1.7900460058552907,
+      "grad_norm": 0.6637442708015442,
+      "learning_rate": 0.0002,
+      "loss": 0.9014,
+      "step": 2140
+    },
+    {
+      "epoch": 1.7984107068172313,
+      "grad_norm": 1.2613177299499512,
+      "learning_rate": 0.0002,
+      "loss": 1.0201,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8067754077791718,
+      "grad_norm": 0.6381147503852844,
+      "learning_rate": 0.0002,
+      "loss": 0.8389,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8151401087411125,
+      "grad_norm": 1.5663173198699951,
+      "learning_rate": 0.0002,
+      "loss": 0.9553,
+      "step": 2170
+    },
+    {
+      "epoch": 1.823504809703053,
+      "grad_norm": 0.8651582598686218,
+      "learning_rate": 0.0002,
+      "loss": 0.9369,
+      "step": 2180
+    },
+    {
+      "epoch": 1.8318695106649936,
+      "grad_norm": 0.7086225152015686,
+      "learning_rate": 0.0002,
+      "loss": 0.8555,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8402342116269343,
+      "grad_norm": 1.0986076593399048,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 2200
+    },
+    {
+      "epoch": 1.848598912588875,
+      "grad_norm": 1.0471370220184326,
+      "learning_rate": 0.0002,
+      "loss": 0.9387,
+      "step": 2210
+    },
+    {
+      "epoch": 1.8569636135508154,
+      "grad_norm": 0.8230622410774231,
+      "learning_rate": 0.0002,
+      "loss": 1.0301,
+      "step": 2220
+    },
+    {
+      "epoch": 1.8653283145127562,
+      "grad_norm": 1.093545913696289,
+      "learning_rate": 0.0002,
+      "loss": 0.9191,
+      "step": 2230
+    },
+    {
+      "epoch": 1.873693015474697,
+      "grad_norm": 0.8182677626609802,
+      "learning_rate": 0.0002,
+      "loss": 0.8441,
+      "step": 2240
+    },
+    {
+      "epoch": 1.8820577164366374,
+      "grad_norm": 0.9356469511985779,
+      "learning_rate": 0.0002,
+      "loss": 0.901,
+      "step": 2250
+    },
+    {
+      "epoch": 1.890422417398578,
+      "grad_norm": 0.8871003985404968,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 2260
+    },
+    {
+      "epoch": 1.8987871183605187,
+      "grad_norm": 1.0431411266326904,
+      "learning_rate": 0.0002,
+      "loss": 1.0625,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9071518193224593,
+      "grad_norm": 1.3339753150939941,
+      "learning_rate": 0.0002,
+      "loss": 0.9897,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9155165202843998,
+      "grad_norm": 0.9365147352218628,
+      "learning_rate": 0.0002,
+      "loss": 0.9742,
+      "step": 2290
+    },
+    {
+      "epoch": 1.9238812212463405,
+      "grad_norm": 0.721367359161377,
+      "learning_rate": 0.0002,
+      "loss": 0.9071,
+      "step": 2300
+    },
+    {
+      "epoch": 1.932245922208281,
+      "grad_norm": 1.0150835514068604,
+      "learning_rate": 0.0002,
+      "loss": 0.9101,
+      "step": 2310
+    },
+    {
+      "epoch": 1.9406106231702216,
+      "grad_norm": 0.7709364891052246,
+      "learning_rate": 0.0002,
+      "loss": 0.9583,
+      "step": 2320
+    },
+    {
+      "epoch": 1.9489753241321623,
+      "grad_norm": 1.035475254058838,
+      "learning_rate": 0.0002,
+      "loss": 0.9725,
+      "step": 2330
+    },
+    {
+      "epoch": 1.9573400250941029,
+      "grad_norm": 1.641360878944397,
+      "learning_rate": 0.0002,
+      "loss": 1.0028,
+      "step": 2340
+    },
+    {
+      "epoch": 1.9657047260560434,
+      "grad_norm": 1.6609785556793213,
+      "learning_rate": 0.0002,
+      "loss": 0.9478,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9740694270179842,
+      "grad_norm": 1.0421160459518433,
+      "learning_rate": 0.0002,
+      "loss": 0.9369,
+      "step": 2360
+    },
+    {
+      "epoch": 1.9824341279799247,
+      "grad_norm": 0.5951679944992065,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 2370
+    },
+    {
+      "epoch": 1.9907988289418652,
+      "grad_norm": 1.2476773262023926,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 2380
+    },
+    {
+      "epoch": 1.999163529903806,
+      "grad_norm": 1.0104742050170898,
+      "learning_rate": 0.0002,
+      "loss": 0.9474,
+      "step": 2390
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1764153242111206,
+      "eval_runtime": 82.769,
+      "eval_samples_per_second": 5.509,
+      "eval_steps_per_second": 0.689,
+      "step": 2391
+    },
+    {
+      "epoch": 2.0075282308657467,
+      "grad_norm": 0.7234944105148315,
+      "learning_rate": 0.0002,
+      "loss": 0.8288,
+      "step": 2400
+    },
+    {
+      "epoch": 2.015892931827687,
+      "grad_norm": 0.9550215601921082,
+      "learning_rate": 0.0002,
+      "loss": 0.7429,
+      "step": 2410
+    },
+    {
+      "epoch": 2.024257632789628,
+      "grad_norm": 1.2027040719985962,
+      "learning_rate": 0.0002,
+      "loss": 0.7585,
+      "step": 2420
+    },
+    {
+      "epoch": 2.0326223337515685,
+      "grad_norm": 0.6850367784500122,
+      "learning_rate": 0.0002,
+      "loss": 0.787,
+      "step": 2430
+    },
+    {
+      "epoch": 2.040987034713509,
+      "grad_norm": 0.9690644145011902,
+      "learning_rate": 0.0002,
+      "loss": 0.7446,
+      "step": 2440
+    },
+    {
+      "epoch": 2.0493517356754496,
+      "grad_norm": 1.1414319276809692,
+      "learning_rate": 0.0002,
+      "loss": 0.7705,
+      "step": 2450
+    },
+    {
+      "epoch": 2.0577164366373903,
+      "grad_norm": 0.9193547964096069,
+      "learning_rate": 0.0002,
+      "loss": 0.7652,
+      "step": 2460
+    },
+    {
+      "epoch": 2.0660811375993307,
+      "grad_norm": 1.358508825302124,
+      "learning_rate": 0.0002,
+      "loss": 0.8316,
+      "step": 2470
+    },
+    {
+      "epoch": 2.0744458385612714,
+      "grad_norm": 0.863886296749115,
+      "learning_rate": 0.0002,
+      "loss": 0.8208,
+      "step": 2480
+    },
+    {
+      "epoch": 2.082810539523212,
+      "grad_norm": 1.2565582990646362,
+      "learning_rate": 0.0002,
+      "loss": 0.7898,
+      "step": 2490
+    },
+    {
+      "epoch": 2.0911752404851525,
+      "grad_norm": 1.3817089796066284,
+      "learning_rate": 0.0002,
+      "loss": 0.7448,
+      "step": 2500
+    },
+    {
+      "epoch": 2.099539941447093,
+      "grad_norm": 1.2113038301467896,
+      "learning_rate": 0.0002,
+      "loss": 0.7404,
+      "step": 2510
+    },
+    {
+      "epoch": 2.107904642409034,
+      "grad_norm": 0.7887806296348572,
+      "learning_rate": 0.0002,
+      "loss": 0.7189,
+      "step": 2520
+    },
+    {
+      "epoch": 2.1162693433709743,
+      "grad_norm": 0.8972041010856628,
+      "learning_rate": 0.0002,
+      "loss": 0.7905,
+      "step": 2530
+    },
+    {
+      "epoch": 2.124634044332915,
+      "grad_norm": 1.3625597953796387,
+      "learning_rate": 0.0002,
+      "loss": 0.6934,
+      "step": 2540
+    },
+    {
+      "epoch": 2.132998745294856,
+      "grad_norm": 1.819085955619812,
+      "learning_rate": 0.0002,
+      "loss": 0.7833,
+      "step": 2550
+    },
+    {
+      "epoch": 2.141363446256796,
+      "grad_norm": 0.965623140335083,
+      "learning_rate": 0.0002,
+      "loss": 0.6858,
+      "step": 2560
+    },
+    {
+      "epoch": 2.149728147218737,
+      "grad_norm": 0.8528746366500854,
+      "learning_rate": 0.0002,
+      "loss": 0.7498,
+      "step": 2570
+    },
+    {
+      "epoch": 2.1580928481806776,
+      "grad_norm": 0.8238094449043274,
+      "learning_rate": 0.0002,
+      "loss": 0.8329,
+      "step": 2580
+    },
+    {
+      "epoch": 2.1664575491426183,
+      "grad_norm": 0.9206092953681946,
+      "learning_rate": 0.0002,
+      "loss": 0.8329,
+      "step": 2590
+    },
+    {
+      "epoch": 2.1748222501045587,
+      "grad_norm": 1.3594036102294922,
+      "learning_rate": 0.0002,
+      "loss": 0.8068,
+      "step": 2600
+    },
+    {
+      "epoch": 2.1831869510664994,
+      "grad_norm": 0.9997738599777222,
+      "learning_rate": 0.0002,
+      "loss": 0.717,
+      "step": 2610
+    },
+    {
+      "epoch": 2.19155165202844,
+      "grad_norm": 0.9230810403823853,
+      "learning_rate": 0.0002,
+      "loss": 0.8211,
+      "step": 2620
+    },
+    {
+      "epoch": 2.1999163529903805,
+      "grad_norm": 0.859367311000824,
+      "learning_rate": 0.0002,
+      "loss": 0.8048,
+      "step": 2630
+    },
+    {
+      "epoch": 2.208281053952321,
+      "grad_norm": 1.087170958518982,
+      "learning_rate": 0.0002,
+      "loss": 0.793,
+      "step": 2640
+    },
+    {
+      "epoch": 2.216645754914262,
+      "grad_norm": 0.8764513731002808,
+      "learning_rate": 0.0002,
+      "loss": 0.7623,
+      "step": 2650
+    },
+    {
+      "epoch": 2.2250104558762023,
+      "grad_norm": 1.4553709030151367,
+      "learning_rate": 0.0002,
+      "loss": 0.7349,
+      "step": 2660
+    },
+    {
+      "epoch": 2.233375156838143,
+      "grad_norm": 0.8835197687149048,
+      "learning_rate": 0.0002,
+      "loss": 0.768,
+      "step": 2670
+    },
+    {
+      "epoch": 2.241739857800084,
+      "grad_norm": 3.089097023010254,
+      "learning_rate": 0.0002,
+      "loss": 0.8307,
+      "step": 2680
+    },
+    {
+      "epoch": 2.250104558762024,
+      "grad_norm": 1.1077880859375,
+      "learning_rate": 0.0002,
+      "loss": 0.7848,
+      "step": 2690
+    },
+    {
+      "epoch": 2.258469259723965,
+      "grad_norm": 0.99500972032547,
+      "learning_rate": 0.0002,
+      "loss": 0.8162,
+      "step": 2700
+    },
+    {
+      "epoch": 2.2668339606859056,
+      "grad_norm": 1.1205966472625732,
+      "learning_rate": 0.0002,
+      "loss": 0.8746,
+      "step": 2710
+    },
+    {
+      "epoch": 2.275198661647846,
+      "grad_norm": 1.661110520362854,
+      "learning_rate": 0.0002,
+      "loss": 0.7618,
+      "step": 2720
+    },
+    {
+      "epoch": 2.2835633626097867,
+      "grad_norm": 0.8378655910491943,
+      "learning_rate": 0.0002,
+      "loss": 0.7953,
+      "step": 2730
+    },
+    {
+      "epoch": 2.2919280635717274,
+      "grad_norm": 0.896804690361023,
+      "learning_rate": 0.0002,
+      "loss": 0.8631,
+      "step": 2740
+    },
+    {
+      "epoch": 2.300292764533668,
+      "grad_norm": 1.3855854272842407,
+      "learning_rate": 0.0002,
+      "loss": 0.8486,
+      "step": 2750
+    },
+    {
+      "epoch": 2.3086574654956085,
+      "grad_norm": 1.2021458148956299,
+      "learning_rate": 0.0002,
+      "loss": 0.7588,
+      "step": 2760
+    },
+    {
+      "epoch": 2.317022166457549,
+      "grad_norm": 0.899120032787323,
+      "learning_rate": 0.0002,
+      "loss": 0.7919,
+      "step": 2770
+    },
+    {
+      "epoch": 2.32538686741949,
+      "grad_norm": 1.828062653541565,
+      "learning_rate": 0.0002,
+      "loss": 0.7732,
+      "step": 2780
+    },
+    {
+      "epoch": 2.3337515683814303,
+      "grad_norm": 0.9991434216499329,
+      "learning_rate": 0.0002,
+      "loss": 0.8045,
+      "step": 2790
+    },
+    {
+      "epoch": 2.342116269343371,
+      "grad_norm": 1.0790653228759766,
+      "learning_rate": 0.0002,
+      "loss": 0.7822,
+      "step": 2800
+    },
+    {
+      "epoch": 2.350480970305312,
+      "grad_norm": 1.0230239629745483,
+      "learning_rate": 0.0002,
+      "loss": 0.8078,
+      "step": 2810
+    },
+    {
+      "epoch": 2.358845671267252,
+      "grad_norm": 1.5933716297149658,
+      "learning_rate": 0.0002,
+      "loss": 0.7708,
+      "step": 2820
+    },
+    {
+      "epoch": 2.367210372229193,
+      "grad_norm": 0.9329839944839478,
+      "learning_rate": 0.0002,
+      "loss": 0.8118,
+      "step": 2830
+    },
+    {
+      "epoch": 2.3755750731911336,
+      "grad_norm": 1.3260419368743896,
+      "learning_rate": 0.0002,
+      "loss": 0.9203,
+      "step": 2840
+    },
+    {
+      "epoch": 2.383939774153074,
+      "grad_norm": 1.5849716663360596,
+      "learning_rate": 0.0002,
+      "loss": 0.8069,
+      "step": 2850
+    },
+    {
+      "epoch": 2.3923044751150147,
+      "grad_norm": 1.3823819160461426,
+      "learning_rate": 0.0002,
+      "loss": 0.7742,
+      "step": 2860
+    },
+    {
+      "epoch": 2.4006691760769554,
+      "grad_norm": 1.9762850999832153,
+      "learning_rate": 0.0002,
+      "loss": 0.7546,
+      "step": 2870
+    },
+    {
+      "epoch": 2.4090338770388957,
+      "grad_norm": 1.26347017288208,
+      "learning_rate": 0.0002,
+      "loss": 0.7905,
+      "step": 2880
+    },
+    {
+      "epoch": 2.4173985780008365,
+      "grad_norm": 1.3072547912597656,
+      "learning_rate": 0.0002,
+      "loss": 0.7065,
+      "step": 2890
+    },
+    {
+      "epoch": 2.425763278962777,
+      "grad_norm": 0.9560710787773132,
+      "learning_rate": 0.0002,
+      "loss": 0.787,
+      "step": 2900
+    },
+    {
+      "epoch": 2.4341279799247175,
+      "grad_norm": 1.218003749847412,
+      "learning_rate": 0.0002,
+      "loss": 0.7588,
+      "step": 2910
+    },
+    {
+      "epoch": 2.4424926808866583,
+      "grad_norm": 1.2331637144088745,
+      "learning_rate": 0.0002,
+      "loss": 0.793,
+      "step": 2920
+    },
+    {
+      "epoch": 2.450857381848599,
+      "grad_norm": 0.8522817492485046,
+      "learning_rate": 0.0002,
+      "loss": 0.7584,
+      "step": 2930
+    },
+    {
+      "epoch": 2.4592220828105393,
+      "grad_norm": 1.088078260421753,
+      "learning_rate": 0.0002,
+      "loss": 0.8426,
+      "step": 2940
+    },
+    {
+      "epoch": 2.46758678377248,
+      "grad_norm": 1.8097302913665771,
+      "learning_rate": 0.0002,
+      "loss": 0.8165,
+      "step": 2950
+    },
+    {
+      "epoch": 2.475951484734421,
+      "grad_norm": 1.3536505699157715,
+      "learning_rate": 0.0002,
+      "loss": 0.7653,
+      "step": 2960
+    },
+    {
+      "epoch": 2.484316185696361,
+      "grad_norm": 1.1186577081680298,
+      "learning_rate": 0.0002,
+      "loss": 0.8158,
+      "step": 2970
+    },
+    {
+      "epoch": 2.492680886658302,
+      "grad_norm": 0.9760162234306335,
+      "learning_rate": 0.0002,
+      "loss": 0.8035,
+      "step": 2980
+    },
+    {
+      "epoch": 2.5010455876202426,
+      "grad_norm": 0.8375564217567444,
+      "learning_rate": 0.0002,
+      "loss": 0.7839,
+      "step": 2990
+    },
+    {
+      "epoch": 2.509410288582183,
+      "grad_norm": 1.2951868772506714,
+      "learning_rate": 0.0002,
+      "loss": 0.8406,
+      "step": 3000
+    },
+    {
+      "epoch": 2.5177749895441237,
+      "grad_norm": 1.4565622806549072,
+      "learning_rate": 0.0002,
+      "loss": 0.7518,
+      "step": 3010
+    },
+    {
+      "epoch": 2.5261396905060645,
+      "grad_norm": 1.3650271892547607,
+      "learning_rate": 0.0002,
+      "loss": 0.7281,
+      "step": 3020
+    },
+    {
+      "epoch": 2.5345043914680048,
+      "grad_norm": 0.9309256076812744,
+      "learning_rate": 0.0002,
+      "loss": 0.7661,
+      "step": 3030
+    },
+    {
+      "epoch": 2.5428690924299455,
+      "grad_norm": 1.07468581199646,
+      "learning_rate": 0.0002,
+      "loss": 0.8193,
+      "step": 3040
+    },
+    {
+      "epoch": 2.5512337933918863,
+      "grad_norm": 1.3080440759658813,
+      "learning_rate": 0.0002,
+      "loss": 0.7766,
+      "step": 3050
+    },
+    {
+      "epoch": 2.5595984943538266,
+      "grad_norm": 1.1031043529510498,
+      "learning_rate": 0.0002,
+      "loss": 0.7574,
+      "step": 3060
+    },
+    {
+      "epoch": 2.5679631953157673,
+      "grad_norm": 1.1987742185592651,
+      "learning_rate": 0.0002,
+      "loss": 0.8079,
+      "step": 3070
+    },
+    {
+      "epoch": 2.576327896277708,
+      "grad_norm": 1.0586541891098022,
+      "learning_rate": 0.0002,
+      "loss": 0.7845,
+      "step": 3080
+    },
+    {
+      "epoch": 2.584692597239649,
+      "grad_norm": 1.102643370628357,
+      "learning_rate": 0.0002,
+      "loss": 0.717,
+      "step": 3090
+    },
+    {
+      "epoch": 2.593057298201589,
+      "grad_norm": 0.9427294731140137,
+      "learning_rate": 0.0002,
+      "loss": 0.7393,
+      "step": 3100
+    },
+    {
+      "epoch": 2.60142199916353,
+      "grad_norm": 1.0983883142471313,
+      "learning_rate": 0.0002,
+      "loss": 0.7737,
+      "step": 3110
+    },
+    {
+      "epoch": 2.6097867001254706,
+      "grad_norm": 1.0884950160980225,
+      "learning_rate": 0.0002,
+      "loss": 0.8082,
+      "step": 3120
+    },
+    {
+      "epoch": 2.6181514010874114,
+      "grad_norm": 0.7318823933601379,
+      "learning_rate": 0.0002,
+      "loss": 0.8692,
+      "step": 3130
+    },
+    {
+      "epoch": 2.6265161020493517,
+      "grad_norm": 1.191433310508728,
+      "learning_rate": 0.0002,
+      "loss": 0.8717,
+      "step": 3140
+    },
+    {
+      "epoch": 2.6348808030112925,
+      "grad_norm": 1.2203903198242188,
+      "learning_rate": 0.0002,
+      "loss": 0.7807,
+      "step": 3150
+    },
+    {
+      "epoch": 2.643245503973233,
+      "grad_norm": 0.9203769564628601,
+      "learning_rate": 0.0002,
+      "loss": 0.8545,
+      "step": 3160
+    },
+    {
+      "epoch": 2.6516102049351735,
+      "grad_norm": 1.298902988433838,
+      "learning_rate": 0.0002,
+      "loss": 0.8506,
+      "step": 3170
+    },
+    {
+      "epoch": 2.6599749058971143,
+      "grad_norm": 1.1898419857025146,
+      "learning_rate": 0.0002,
+      "loss": 0.7738,
+      "step": 3180
+    },
+    {
+      "epoch": 2.668339606859055,
+      "grad_norm": 1.0493841171264648,
+      "learning_rate": 0.0002,
+      "loss": 0.7995,
+      "step": 3190
+    },
+    {
+      "epoch": 2.6767043078209953,
+      "grad_norm": 1.242353081703186,
+      "learning_rate": 0.0002,
+      "loss": 0.7892,
+      "step": 3200
+    },
+    {
+      "epoch": 2.685069008782936,
+      "grad_norm": 0.9045358896255493,
+      "learning_rate": 0.0002,
+      "loss": 0.7538,
+      "step": 3210
+    },
+    {
+      "epoch": 2.693433709744877,
+      "grad_norm": 0.8066250085830688,
+      "learning_rate": 0.0002,
+      "loss": 0.8079,
+      "step": 3220
+    },
+    {
+      "epoch": 2.701798410706817,
+      "grad_norm": 1.0554566383361816,
+      "learning_rate": 0.0002,
+      "loss": 0.8163,
+      "step": 3230
+    },
+    {
+      "epoch": 2.710163111668758,
+      "grad_norm": 1.2041901350021362,
+      "learning_rate": 0.0002,
+      "loss": 0.8311,
+      "step": 3240
+    },
+    {
+      "epoch": 2.7185278126306986,
+      "grad_norm": 0.882590651512146,
+      "learning_rate": 0.0002,
+      "loss": 0.7968,
+      "step": 3250
+    },
+    {
+      "epoch": 2.726892513592639,
+      "grad_norm": 1.0998138189315796,
+      "learning_rate": 0.0002,
+      "loss": 0.7699,
+      "step": 3260
+    },
+    {
+      "epoch": 2.7352572145545797,
+      "grad_norm": 0.996216356754303,
+      "learning_rate": 0.0002,
+      "loss": 0.6923,
+      "step": 3270
+    },
+    {
+      "epoch": 2.7436219155165205,
+      "grad_norm": 0.8555099368095398,
+      "learning_rate": 0.0002,
+      "loss": 0.7935,
+      "step": 3280
+    },
+    {
+      "epoch": 2.7519866164784608,
+      "grad_norm": 1.1827069520950317,
+      "learning_rate": 0.0002,
+      "loss": 0.7199,
+      "step": 3290
+    },
+    {
+      "epoch": 2.7603513174404015,
+      "grad_norm": 1.0703648328781128,
+      "learning_rate": 0.0002,
+      "loss": 0.8036,
+      "step": 3300
+    },
+    {
+      "epoch": 2.7687160184023423,
+      "grad_norm": 1.3048015832901,
+      "learning_rate": 0.0002,
+      "loss": 0.8174,
+      "step": 3310
+    },
+    {
+      "epoch": 2.7770807193642826,
+      "grad_norm": 1.169602632522583,
+      "learning_rate": 0.0002,
+      "loss": 0.8535,
+      "step": 3320
+    },
+    {
+      "epoch": 2.7854454203262233,
+      "grad_norm": 1.2847896814346313,
+      "learning_rate": 0.0002,
+      "loss": 0.8343,
+      "step": 3330
+    },
+    {
+      "epoch": 2.793810121288164,
+      "grad_norm": 1.2243924140930176,
+      "learning_rate": 0.0002,
+      "loss": 0.855,
+      "step": 3340
+    },
+    {
+      "epoch": 2.8021748222501044,
+      "grad_norm": 0.8506208658218384,
+      "learning_rate": 0.0002,
+      "loss": 0.6935,
+      "step": 3350
+    },
+    {
+      "epoch": 2.810539523212045,
+      "grad_norm": 1.4431512355804443,
+      "learning_rate": 0.0002,
+      "loss": 0.765,
+      "step": 3360
+    },
+    {
+      "epoch": 2.818904224173986,
+      "grad_norm": 1.4266667366027832,
+      "learning_rate": 0.0002,
+      "loss": 0.7762,
+      "step": 3370
+    },
+    {
+      "epoch": 2.827268925135926,
+      "grad_norm": 0.9062533378601074,
+      "learning_rate": 0.0002,
+      "loss": 0.7494,
+      "step": 3380
+    },
+    {
+      "epoch": 2.835633626097867,
+      "grad_norm": 1.1310938596725464,
+      "learning_rate": 0.0002,
+      "loss": 0.7772,
+      "step": 3390
+    },
+    {
+      "epoch": 2.8439983270598077,
+      "grad_norm": 1.0112131834030151,
+      "learning_rate": 0.0002,
+      "loss": 0.7389,
+      "step": 3400
+    },
+    {
+      "epoch": 2.852363028021748,
+      "grad_norm": 2.258208751678467,
+      "learning_rate": 0.0002,
+      "loss": 0.8284,
+      "step": 3410
+    },
+    {
+      "epoch": 2.8607277289836888,
+      "grad_norm": 0.8820013403892517,
+      "learning_rate": 0.0002,
+      "loss": 0.8338,
+      "step": 3420
+    },
+    {
+      "epoch": 2.8690924299456295,
+      "grad_norm": 1.142311453819275,
+      "learning_rate": 0.0002,
+      "loss": 0.805,
+      "step": 3430
+    },
+    {
+      "epoch": 2.87745713090757,
+      "grad_norm": 1.8235642910003662,
+      "learning_rate": 0.0002,
+      "loss": 0.8523,
+      "step": 3440
+    },
+    {
+      "epoch": 2.8858218318695106,
+      "grad_norm": 1.963779330253601,
+      "learning_rate": 0.0002,
+      "loss": 0.783,
+      "step": 3450
+    },
+    {
+      "epoch": 2.8941865328314513,
+      "grad_norm": 1.1153792142868042,
+      "learning_rate": 0.0002,
+      "loss": 0.8492,
+      "step": 3460
+    },
+    {
+      "epoch": 2.9025512337933916,
+      "grad_norm": 1.216044545173645,
+      "learning_rate": 0.0002,
+      "loss": 0.8288,
+      "step": 3470
+    },
+    {
+      "epoch": 2.9109159347553324,
+      "grad_norm": 1.0727925300598145,
+      "learning_rate": 0.0002,
+      "loss": 0.7853,
+      "step": 3480
+    },
+    {
+      "epoch": 2.919280635717273,
+      "grad_norm": 1.4920282363891602,
+      "learning_rate": 0.0002,
+      "loss": 0.7624,
+      "step": 3490
+    },
+    {
+      "epoch": 2.9276453366792135,
+      "grad_norm": 2.0315160751342773,
+      "learning_rate": 0.0002,
+      "loss": 0.8183,
+      "step": 3500
+    },
+    {
+      "epoch": 2.936010037641154,
+      "grad_norm": 1.0698133707046509,
+      "learning_rate": 0.0002,
+      "loss": 0.8094,
+      "step": 3510
+    },
+    {
+      "epoch": 2.944374738603095,
+      "grad_norm": 1.2448259592056274,
+      "learning_rate": 0.0002,
+      "loss": 0.8727,
+      "step": 3520
+    },
+    {
+      "epoch": 2.9527394395650357,
+      "grad_norm": 0.9577398896217346,
+      "learning_rate": 0.0002,
+      "loss": 0.8672,
+      "step": 3530
+    },
+    {
+      "epoch": 2.961104140526976,
+      "grad_norm": 1.1637893915176392,
+      "learning_rate": 0.0002,
+      "loss": 0.8408,
+      "step": 3540
+    },
+    {
+      "epoch": 2.9694688414889168,
+      "grad_norm": 1.5379204750061035,
+      "learning_rate": 0.0002,
+      "loss": 0.719,
+      "step": 3550
+    },
+    {
+      "epoch": 2.9778335424508575,
+      "grad_norm": 1.3025894165039062,
+      "learning_rate": 0.0002,
+      "loss": 0.8086,
+      "step": 3560
+    },
+    {
+      "epoch": 2.9861982434127983,
+      "grad_norm": 1.049248456954956,
+      "learning_rate": 0.0002,
+      "loss": 0.8768,
+      "step": 3570
+    },
+    {
+      "epoch": 2.9945629443747386,
+      "grad_norm": 1.524281620979309,
+      "learning_rate": 0.0002,
+      "loss": 0.7704,
+      "step": 3580
+    },
+    {
+      "epoch": 2.9995817649519028,
+      "eval_loss": 1.2029013633728027,
+      "eval_runtime": 33.4084,
+      "eval_samples_per_second": 13.649,
+      "eval_steps_per_second": 1.706,
+      "step": 3586
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9560,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.573567073353728e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82cc1a869a13836981e3f5a21d92f839005da543aa938bca6e96fe51edb97f77
+size 5624

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1cdedb4cb9218424d2baaf3d0c24e7524ec72414ee840dcd4a8f968a85765ff6
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:797b9133905e623e2a9f68e2e5d02fd7dc400fa2b26dc9bd7a85e03eee39a952
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:565621c98a68043d0238d40cf2b8b9361db63beb309d6c0db2ef4ca3e90f1c5e
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce74112639367b34477ae1d8753d64f793092a1fb1f440ea7a3a46bf6bbd00ed
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3411 @@

+{
+  "best_metric": 1.1764153242111206,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-10000/checkpoint-2391",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 4782,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008364700961940611,
+      "grad_norm": 1.2661447525024414,
+      "learning_rate": 0.0002,
+      "loss": 1.9268,
+      "step": 10
+    },
+    {
+      "epoch": 0.016729401923881223,
+      "grad_norm": 1.3240571022033691,
+      "learning_rate": 0.0002,
+      "loss": 1.6326,
+      "step": 20
+    },
+    {
+      "epoch": 0.025094102885821833,
+      "grad_norm": 0.7347124218940735,
+      "learning_rate": 0.0002,
+      "loss": 1.507,
+      "step": 30
+    },
+    {
+      "epoch": 0.033458803847762446,
+      "grad_norm": 0.9849194288253784,
+      "learning_rate": 0.0002,
+      "loss": 1.5363,
+      "step": 40
+    },
+    {
+      "epoch": 0.04182350480970305,
+      "grad_norm": 0.9093025326728821,
+      "learning_rate": 0.0002,
+      "loss": 1.3674,
+      "step": 50
+    },
+    {
+      "epoch": 0.050188205771643665,
+      "grad_norm": 0.737514078617096,
+      "learning_rate": 0.0002,
+      "loss": 1.2542,
+      "step": 60
+    },
+    {
+      "epoch": 0.05855290673358427,
+      "grad_norm": 1.3245333433151245,
+      "learning_rate": 0.0002,
+      "loss": 1.2172,
+      "step": 70
+    },
+    {
+      "epoch": 0.06691760769552489,
+      "grad_norm": 0.7806007862091064,
+      "learning_rate": 0.0002,
+      "loss": 1.2478,
+      "step": 80
+    },
+    {
+      "epoch": 0.07528230865746549,
+      "grad_norm": 0.6627500057220459,
+      "learning_rate": 0.0002,
+      "loss": 1.1398,
+      "step": 90
+    },
+    {
+      "epoch": 0.0836470096194061,
+      "grad_norm": 1.0278682708740234,
+      "learning_rate": 0.0002,
+      "loss": 1.1363,
+      "step": 100
+    },
+    {
+      "epoch": 0.09201171058134672,
+      "grad_norm": 0.7746474146842957,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 110
+    },
+    {
+      "epoch": 0.10037641154328733,
+      "grad_norm": 0.5935637950897217,
+      "learning_rate": 0.0002,
+      "loss": 1.1171,
+      "step": 120
+    },
+    {
+      "epoch": 0.10874111250522794,
+      "grad_norm": 0.6738003492355347,
+      "learning_rate": 0.0002,
+      "loss": 1.1625,
+      "step": 130
+    },
+    {
+      "epoch": 0.11710581346716854,
+      "grad_norm": 0.6447349190711975,
+      "learning_rate": 0.0002,
+      "loss": 1.3002,
+      "step": 140
+    },
+    {
+      "epoch": 0.12547051442910917,
+      "grad_norm": 0.5628684759140015,
+      "learning_rate": 0.0002,
+      "loss": 1.1294,
+      "step": 150
+    },
+    {
+      "epoch": 0.13383521539104978,
+      "grad_norm": 0.7446871399879456,
+      "learning_rate": 0.0002,
+      "loss": 1.1374,
+      "step": 160
+    },
+    {
+      "epoch": 0.14219991635299037,
+      "grad_norm": 0.5214279294013977,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 170
+    },
+    {
+      "epoch": 0.15056461731493098,
+      "grad_norm": 0.5324464440345764,
+      "learning_rate": 0.0002,
+      "loss": 1.2612,
+      "step": 180
+    },
+    {
+      "epoch": 0.1589293182768716,
+      "grad_norm": 0.5539828538894653,
+      "learning_rate": 0.0002,
+      "loss": 1.3168,
+      "step": 190
+    },
+    {
+      "epoch": 0.1672940192388122,
+      "grad_norm": 0.5192331671714783,
+      "learning_rate": 0.0002,
+      "loss": 1.0835,
+      "step": 200
+    },
+    {
+      "epoch": 0.17565872020075282,
+      "grad_norm": 0.7160783410072327,
+      "learning_rate": 0.0002,
+      "loss": 1.1799,
+      "step": 210
+    },
+    {
+      "epoch": 0.18402342116269343,
+      "grad_norm": 0.8428353071212769,
+      "learning_rate": 0.0002,
+      "loss": 1.1527,
+      "step": 220
+    },
+    {
+      "epoch": 0.19238812212463405,
+      "grad_norm": 0.493561714887619,
+      "learning_rate": 0.0002,
+      "loss": 1.1284,
+      "step": 230
+    },
+    {
+      "epoch": 0.20075282308657466,
+      "grad_norm": 2.522308111190796,
+      "learning_rate": 0.0002,
+      "loss": 1.1975,
+      "step": 240
+    },
+    {
+      "epoch": 0.20911752404851527,
+      "grad_norm": 0.7338423728942871,
+      "learning_rate": 0.0002,
+      "loss": 1.1459,
+      "step": 250
+    },
+    {
+      "epoch": 0.2174822250104559,
+      "grad_norm": 0.6501832604408264,
+      "learning_rate": 0.0002,
+      "loss": 1.1311,
+      "step": 260
+    },
+    {
+      "epoch": 0.2258469259723965,
+      "grad_norm": 0.6331472992897034,
+      "learning_rate": 0.0002,
+      "loss": 1.2241,
+      "step": 270
+    },
+    {
+      "epoch": 0.23421162693433709,
+      "grad_norm": 0.5653548836708069,
+      "learning_rate": 0.0002,
+      "loss": 1.2329,
+      "step": 280
+    },
+    {
+      "epoch": 0.2425763278962777,
+      "grad_norm": 0.5833444595336914,
+      "learning_rate": 0.0002,
+      "loss": 1.119,
+      "step": 290
+    },
+    {
+      "epoch": 0.25094102885821834,
+      "grad_norm": 0.6707335114479065,
+      "learning_rate": 0.0002,
+      "loss": 1.2157,
+      "step": 300
+    },
+    {
+      "epoch": 0.2593057298201589,
+      "grad_norm": 0.5435659885406494,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 310
+    },
+    {
+      "epoch": 0.26767043078209957,
+      "grad_norm": 0.5752334594726562,
+      "learning_rate": 0.0002,
+      "loss": 1.0781,
+      "step": 320
+    },
+    {
+      "epoch": 0.27603513174404015,
+      "grad_norm": 0.5790163278579712,
+      "learning_rate": 0.0002,
+      "loss": 1.0493,
+      "step": 330
+    },
+    {
+      "epoch": 0.28439983270598074,
+      "grad_norm": 0.46593040227890015,
+      "learning_rate": 0.0002,
+      "loss": 1.2281,
+      "step": 340
+    },
+    {
+      "epoch": 0.2927645336679214,
+      "grad_norm": 0.7713788151741028,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 350
+    },
+    {
+      "epoch": 0.30112923462986196,
+      "grad_norm": 0.7719253301620483,
+      "learning_rate": 0.0002,
+      "loss": 1.1672,
+      "step": 360
+    },
+    {
+      "epoch": 0.3094939355918026,
+      "grad_norm": 0.7065562605857849,
+      "learning_rate": 0.0002,
+      "loss": 1.0884,
+      "step": 370
+    },
+    {
+      "epoch": 0.3178586365537432,
+      "grad_norm": 0.7082679271697998,
+      "learning_rate": 0.0002,
+      "loss": 1.0902,
+      "step": 380
+    },
+    {
+      "epoch": 0.32622333751568383,
+      "grad_norm": 0.5779536366462708,
+      "learning_rate": 0.0002,
+      "loss": 1.1696,
+      "step": 390
+    },
+    {
+      "epoch": 0.3345880384776244,
+      "grad_norm": 0.6321173310279846,
+      "learning_rate": 0.0002,
+      "loss": 1.1916,
+      "step": 400
+    },
+    {
+      "epoch": 0.34295273943956506,
+      "grad_norm": 0.7237968444824219,
+      "learning_rate": 0.0002,
+      "loss": 1.1419,
+      "step": 410
+    },
+    {
+      "epoch": 0.35131744040150564,
+      "grad_norm": 0.6730817556381226,
+      "learning_rate": 0.0002,
+      "loss": 0.9877,
+      "step": 420
+    },
+    {
+      "epoch": 0.3596821413634463,
+      "grad_norm": 0.6245285868644714,
+      "learning_rate": 0.0002,
+      "loss": 1.108,
+      "step": 430
+    },
+    {
+      "epoch": 0.36804684232538687,
+      "grad_norm": 0.9926134347915649,
+      "learning_rate": 0.0002,
+      "loss": 1.209,
+      "step": 440
+    },
+    {
+      "epoch": 0.37641154328732745,
+      "grad_norm": 0.5567468404769897,
+      "learning_rate": 0.0002,
+      "loss": 1.0664,
+      "step": 450
+    },
+    {
+      "epoch": 0.3847762442492681,
+      "grad_norm": 0.5764540433883667,
+      "learning_rate": 0.0002,
+      "loss": 1.1838,
+      "step": 460
+    },
+    {
+      "epoch": 0.3931409452112087,
+      "grad_norm": 1.1908321380615234,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 470
+    },
+    {
+      "epoch": 0.4015056461731493,
+      "grad_norm": 0.6756157875061035,
+      "learning_rate": 0.0002,
+      "loss": 1.1601,
+      "step": 480
+    },
+    {
+      "epoch": 0.4098703471350899,
+      "grad_norm": 0.5793355107307434,
+      "learning_rate": 0.0002,
+      "loss": 1.1703,
+      "step": 490
+    },
+    {
+      "epoch": 0.41823504809703055,
+      "grad_norm": 0.6145297288894653,
+      "learning_rate": 0.0002,
+      "loss": 1.1289,
+      "step": 500
+    },
+    {
+      "epoch": 0.42659974905897113,
+      "grad_norm": 0.48073795437812805,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 510
+    },
+    {
+      "epoch": 0.4349644500209118,
+      "grad_norm": 0.802431046962738,
+      "learning_rate": 0.0002,
+      "loss": 1.1335,
+      "step": 520
+    },
+    {
+      "epoch": 0.44332915098285236,
+      "grad_norm": 0.5906000137329102,
+      "learning_rate": 0.0002,
+      "loss": 1.0574,
+      "step": 530
+    },
+    {
+      "epoch": 0.451693851944793,
+      "grad_norm": 0.5615521669387817,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 540
+    },
+    {
+      "epoch": 0.4600585529067336,
+      "grad_norm": 0.5688650012016296,
+      "learning_rate": 0.0002,
+      "loss": 1.2228,
+      "step": 550
+    },
+    {
+      "epoch": 0.46842325386867417,
+      "grad_norm": 0.7505079507827759,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 560
+    },
+    {
+      "epoch": 0.4767879548306148,
+      "grad_norm": 0.6905680298805237,
+      "learning_rate": 0.0002,
+      "loss": 1.1566,
+      "step": 570
+    },
+    {
+      "epoch": 0.4851526557925554,
+      "grad_norm": 0.5885183811187744,
+      "learning_rate": 0.0002,
+      "loss": 1.1256,
+      "step": 580
+    },
+    {
+      "epoch": 0.49351735675449604,
+      "grad_norm": 0.7367458343505859,
+      "learning_rate": 0.0002,
+      "loss": 1.211,
+      "step": 590
+    },
+    {
+      "epoch": 0.5018820577164367,
+      "grad_norm": 0.9157859086990356,
+      "learning_rate": 0.0002,
+      "loss": 1.1215,
+      "step": 600
+    },
+    {
+      "epoch": 0.5102467586783772,
+      "grad_norm": 0.49971529841423035,
+      "learning_rate": 0.0002,
+      "loss": 1.3101,
+      "step": 610
+    },
+    {
+      "epoch": 0.5186114596403179,
+      "grad_norm": 0.5031328797340393,
+      "learning_rate": 0.0002,
+      "loss": 1.1223,
+      "step": 620
+    },
+    {
+      "epoch": 0.5269761606022585,
+      "grad_norm": 0.6945798397064209,
+      "learning_rate": 0.0002,
+      "loss": 1.154,
+      "step": 630
+    },
+    {
+      "epoch": 0.5353408615641991,
+      "grad_norm": 0.7563218474388123,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 640
+    },
+    {
+      "epoch": 0.5437055625261397,
+      "grad_norm": 0.9215132594108582,
+      "learning_rate": 0.0002,
+      "loss": 1.2364,
+      "step": 650
+    },
+    {
+      "epoch": 0.5520702634880803,
+      "grad_norm": 1.0132478475570679,
+      "learning_rate": 0.0002,
+      "loss": 1.2179,
+      "step": 660
+    },
+    {
+      "epoch": 0.560434964450021,
+      "grad_norm": 1.448024868965149,
+      "learning_rate": 0.0002,
+      "loss": 1.1016,
+      "step": 670
+    },
+    {
+      "epoch": 0.5687996654119615,
+      "grad_norm": 0.7022866010665894,
+      "learning_rate": 0.0002,
+      "loss": 1.1918,
+      "step": 680
+    },
+    {
+      "epoch": 0.5771643663739021,
+      "grad_norm": 0.7366224527359009,
+      "learning_rate": 0.0002,
+      "loss": 1.1108,
+      "step": 690
+    },
+    {
+      "epoch": 0.5855290673358428,
+      "grad_norm": 0.722874641418457,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 700
+    },
+    {
+      "epoch": 0.5938937682977834,
+      "grad_norm": 1.0756473541259766,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 710
+    },
+    {
+      "epoch": 0.6022584692597239,
+      "grad_norm": 0.607101559638977,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 720
+    },
+    {
+      "epoch": 0.6106231702216646,
+      "grad_norm": 0.7424359917640686,
+      "learning_rate": 0.0002,
+      "loss": 1.1561,
+      "step": 730
+    },
+    {
+      "epoch": 0.6189878711836052,
+      "grad_norm": 0.7123169898986816,
+      "learning_rate": 0.0002,
+      "loss": 1.1124,
+      "step": 740
+    },
+    {
+      "epoch": 0.6273525721455459,
+      "grad_norm": 0.672195315361023,
+      "learning_rate": 0.0002,
+      "loss": 1.1209,
+      "step": 750
+    },
+    {
+      "epoch": 0.6357172731074864,
+      "grad_norm": 0.8329780697822571,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 760
+    },
+    {
+      "epoch": 0.644081974069427,
+      "grad_norm": 0.7011522650718689,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 770
+    },
+    {
+      "epoch": 0.6524466750313677,
+      "grad_norm": 0.6425889730453491,
+      "learning_rate": 0.0002,
+      "loss": 1.2505,
+      "step": 780
+    },
+    {
+      "epoch": 0.6608113759933082,
+      "grad_norm": 0.8729137182235718,
+      "learning_rate": 0.0002,
+      "loss": 1.2005,
+      "step": 790
+    },
+    {
+      "epoch": 0.6691760769552488,
+      "grad_norm": 0.5885024070739746,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 800
+    },
+    {
+      "epoch": 0.6775407779171895,
+      "grad_norm": 0.526979386806488,
+      "learning_rate": 0.0002,
+      "loss": 1.1901,
+      "step": 810
+    },
+    {
+      "epoch": 0.6859054788791301,
+      "grad_norm": 0.998365044593811,
+      "learning_rate": 0.0002,
+      "loss": 1.1757,
+      "step": 820
+    },
+    {
+      "epoch": 0.6942701798410706,
+      "grad_norm": 0.6049501299858093,
+      "learning_rate": 0.0002,
+      "loss": 1.0278,
+      "step": 830
+    },
+    {
+      "epoch": 0.7026348808030113,
+      "grad_norm": 0.7015583515167236,
+      "learning_rate": 0.0002,
+      "loss": 1.1102,
+      "step": 840
+    },
+    {
+      "epoch": 0.7109995817649519,
+      "grad_norm": 0.5852547883987427,
+      "learning_rate": 0.0002,
+      "loss": 1.1041,
+      "step": 850
+    },
+    {
+      "epoch": 0.7193642827268926,
+      "grad_norm": 0.6017204523086548,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 860
+    },
+    {
+      "epoch": 0.7277289836888331,
+      "grad_norm": 0.7195692658424377,
+      "learning_rate": 0.0002,
+      "loss": 1.0611,
+      "step": 870
+    },
+    {
+      "epoch": 0.7360936846507737,
+      "grad_norm": 0.8087519407272339,
+      "learning_rate": 0.0002,
+      "loss": 1.1497,
+      "step": 880
+    },
+    {
+      "epoch": 0.7444583856127144,
+      "grad_norm": 0.988362193107605,
+      "learning_rate": 0.0002,
+      "loss": 1.1087,
+      "step": 890
+    },
+    {
+      "epoch": 0.7528230865746549,
+      "grad_norm": 0.6142330765724182,
+      "learning_rate": 0.0002,
+      "loss": 1.049,
+      "step": 900
+    },
+    {
+      "epoch": 0.7611877875365956,
+      "grad_norm": 0.6751818656921387,
+      "learning_rate": 0.0002,
+      "loss": 1.0388,
+      "step": 910
+    },
+    {
+      "epoch": 0.7695524884985362,
+      "grad_norm": 0.7528653740882874,
+      "learning_rate": 0.0002,
+      "loss": 1.2125,
+      "step": 920
+    },
+    {
+      "epoch": 0.7779171894604768,
+      "grad_norm": 0.613039493560791,
+      "learning_rate": 0.0002,
+      "loss": 0.9926,
+      "step": 930
+    },
+    {
+      "epoch": 0.7862818904224174,
+      "grad_norm": 0.8040242791175842,
+      "learning_rate": 0.0002,
+      "loss": 1.2582,
+      "step": 940
+    },
+    {
+      "epoch": 0.794646591384358,
+      "grad_norm": 0.5306838154792786,
+      "learning_rate": 0.0002,
+      "loss": 1.1397,
+      "step": 950
+    },
+    {
+      "epoch": 0.8030112923462986,
+      "grad_norm": 0.7037438750267029,
+      "learning_rate": 0.0002,
+      "loss": 1.0303,
+      "step": 960
+    },
+    {
+      "epoch": 0.8113759933082393,
+      "grad_norm": 0.6726985573768616,
+      "learning_rate": 0.0002,
+      "loss": 1.1531,
+      "step": 970
+    },
+    {
+      "epoch": 0.8197406942701798,
+      "grad_norm": 0.9324426651000977,
+      "learning_rate": 0.0002,
+      "loss": 1.125,
+      "step": 980
+    },
+    {
+      "epoch": 0.8281053952321205,
+      "grad_norm": 0.5811492204666138,
+      "learning_rate": 0.0002,
+      "loss": 1.0744,
+      "step": 990
+    },
+    {
+      "epoch": 0.8364700961940611,
+      "grad_norm": 0.6894899606704712,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8448347971560016,
+      "grad_norm": 0.5663559436798096,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8531994981179423,
+      "grad_norm": 0.5555400252342224,
+      "learning_rate": 0.0002,
+      "loss": 1.0337,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8615641990798829,
+      "grad_norm": 0.4418621063232422,
+      "learning_rate": 0.0002,
+      "loss": 1.1086,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8699289000418235,
+      "grad_norm": 0.7832980751991272,
+      "learning_rate": 0.0002,
+      "loss": 1.1291,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8782936010037641,
+      "grad_norm": 0.6883782744407654,
+      "learning_rate": 0.0002,
+      "loss": 1.1538,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8866583019657047,
+      "grad_norm": 0.5617508888244629,
+      "learning_rate": 0.0002,
+      "loss": 1.0311,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8950230029276454,
+      "grad_norm": 0.723233699798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1869,
+      "step": 1070
+    },
+    {
+      "epoch": 0.903387703889586,
+      "grad_norm": 2.8922297954559326,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9117524048515265,
+      "grad_norm": 1.5861668586730957,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9201171058134672,
+      "grad_norm": 0.6625565886497498,
+      "learning_rate": 0.0002,
+      "loss": 1.0758,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9284818067754078,
+      "grad_norm": 0.6424002647399902,
+      "learning_rate": 0.0002,
+      "loss": 1.2524,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9368465077373483,
+      "grad_norm": 0.7253570556640625,
+      "learning_rate": 0.0002,
+      "loss": 1.0261,
+      "step": 1120
+    },
+    {
+      "epoch": 0.945211208699289,
+      "grad_norm": 0.6529237627983093,
+      "learning_rate": 0.0002,
+      "loss": 1.2131,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9535759096612296,
+      "grad_norm": 0.7082931399345398,
+      "learning_rate": 0.0002,
+      "loss": 1.0705,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9619406106231703,
+      "grad_norm": 1.10663902759552,
+      "learning_rate": 0.0002,
+      "loss": 1.2197,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9703053115851108,
+      "grad_norm": 0.6979895830154419,
+      "learning_rate": 0.0002,
+      "loss": 1.1051,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9786700125470514,
+      "grad_norm": 0.896873950958252,
+      "learning_rate": 0.0002,
+      "loss": 1.1516,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9870347135089921,
+      "grad_norm": 0.5664224624633789,
+      "learning_rate": 0.0002,
+      "loss": 1.0224,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9953994144709327,
+      "grad_norm": 0.6827336549758911,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 1190
+    },
+    {
+      "epoch": 0.999581764951903,
+      "eval_loss": 1.1974399089813232,
+      "eval_runtime": 83.0008,
+      "eval_samples_per_second": 5.494,
+      "eval_steps_per_second": 0.687,
+      "step": 1195
+    },
+    {
+      "epoch": 1.0037641154328734,
+      "grad_norm": 0.5443172454833984,
+      "learning_rate": 0.0002,
+      "loss": 1.0743,
+      "step": 1200
+    },
+    {
+      "epoch": 1.012128816394814,
+      "grad_norm": 0.71578449010849,
+      "learning_rate": 0.0002,
+      "loss": 0.9491,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0204935173567544,
+      "grad_norm": 0.681245744228363,
+      "learning_rate": 0.0002,
+      "loss": 0.8987,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0288582183186952,
+      "grad_norm": 0.5959660410881042,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0372229192806357,
+      "grad_norm": 0.581801176071167,
+      "learning_rate": 0.0002,
+      "loss": 0.9537,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0455876202425762,
+      "grad_norm": 0.6427032947540283,
+      "learning_rate": 0.0002,
+      "loss": 0.9291,
+      "step": 1250
+    },
+    {
+      "epoch": 1.053952321204517,
+      "grad_norm": 1.2949297428131104,
+      "learning_rate": 0.0002,
+      "loss": 1.0572,
+      "step": 1260
+    },
+    {
+      "epoch": 1.0623170221664575,
+      "grad_norm": 0.7161147594451904,
+      "learning_rate": 0.0002,
+      "loss": 0.876,
+      "step": 1270
+    },
+    {
+      "epoch": 1.070681723128398,
+      "grad_norm": 0.8515461087226868,
+      "learning_rate": 0.0002,
+      "loss": 0.9624,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0790464240903388,
+      "grad_norm": 0.9086605906486511,
+      "learning_rate": 0.0002,
+      "loss": 1.0332,
+      "step": 1290
+    },
+    {
+      "epoch": 1.0874111250522793,
+      "grad_norm": 0.525374174118042,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 1300
+    },
+    {
+      "epoch": 1.09577582601422,
+      "grad_norm": 0.6631740927696228,
+      "learning_rate": 0.0002,
+      "loss": 0.987,
+      "step": 1310
+    },
+    {
+      "epoch": 1.1041405269761606,
+      "grad_norm": 0.8387110233306885,
+      "learning_rate": 0.0002,
+      "loss": 1.0077,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1125052279381011,
+      "grad_norm": 0.8402808308601379,
+      "learning_rate": 0.0002,
+      "loss": 1.0299,
+      "step": 1330
+    },
+    {
+      "epoch": 1.120869928900042,
+      "grad_norm": 0.6945340037345886,
+      "learning_rate": 0.0002,
+      "loss": 0.9625,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1292346298619824,
+      "grad_norm": 0.6942460536956787,
+      "learning_rate": 0.0002,
+      "loss": 0.9181,
+      "step": 1350
+    },
+    {
+      "epoch": 1.137599330823923,
+      "grad_norm": 0.7074856758117676,
+      "learning_rate": 0.0002,
+      "loss": 1.0279,
+      "step": 1360
+    },
+    {
+      "epoch": 1.1459640317858637,
+      "grad_norm": 0.6957907676696777,
+      "learning_rate": 0.0002,
+      "loss": 0.9177,
+      "step": 1370
+    },
+    {
+      "epoch": 1.1543287327478042,
+      "grad_norm": 0.7241228818893433,
+      "learning_rate": 0.0002,
+      "loss": 1.0561,
+      "step": 1380
+    },
+    {
+      "epoch": 1.162693433709745,
+      "grad_norm": 1.2119261026382446,
+      "learning_rate": 0.0002,
+      "loss": 0.974,
+      "step": 1390
+    },
+    {
+      "epoch": 1.1710581346716855,
+      "grad_norm": 0.7284879684448242,
+      "learning_rate": 0.0002,
+      "loss": 0.9813,
+      "step": 1400
+    },
+    {
+      "epoch": 1.179422835633626,
+      "grad_norm": 0.702438473701477,
+      "learning_rate": 0.0002,
+      "loss": 0.9153,
+      "step": 1410
+    },
+    {
+      "epoch": 1.1877875365955668,
+      "grad_norm": 0.9390414357185364,
+      "learning_rate": 0.0002,
+      "loss": 1.0409,
+      "step": 1420
+    },
+    {
+      "epoch": 1.1961522375575073,
+      "grad_norm": 0.8179782629013062,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2045169385194479,
+      "grad_norm": 1.4885749816894531,
+      "learning_rate": 0.0002,
+      "loss": 1.0606,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2128816394813886,
+      "grad_norm": 0.868131697177887,
+      "learning_rate": 0.0002,
+      "loss": 0.9497,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2212463404433291,
+      "grad_norm": 0.8125514388084412,
+      "learning_rate": 0.0002,
+      "loss": 0.9398,
+      "step": 1460
+    },
+    {
+      "epoch": 1.2296110414052697,
+      "grad_norm": 0.633736789226532,
+      "learning_rate": 0.0002,
+      "loss": 0.8868,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2379757423672104,
+      "grad_norm": 0.6061311364173889,
+      "learning_rate": 0.0002,
+      "loss": 0.9484,
+      "step": 1480
+    },
+    {
+      "epoch": 1.246340443329151,
+      "grad_norm": 0.6683570742607117,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2547051442910915,
+      "grad_norm": 0.6832399964332581,
+      "learning_rate": 0.0002,
+      "loss": 0.9645,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2630698452530322,
+      "grad_norm": 0.7690117955207825,
+      "learning_rate": 0.0002,
+      "loss": 0.9892,
+      "step": 1510
+    },
+    {
+      "epoch": 1.2714345462149728,
+      "grad_norm": 0.7987741231918335,
+      "learning_rate": 0.0002,
+      "loss": 1.0383,
+      "step": 1520
+    },
+    {
+      "epoch": 1.2797992471769133,
+      "grad_norm": 0.527604877948761,
+      "learning_rate": 0.0002,
+      "loss": 0.9531,
+      "step": 1530
+    },
+    {
+      "epoch": 1.288163948138854,
+      "grad_norm": 0.6243641376495361,
+      "learning_rate": 0.0002,
+      "loss": 0.9239,
+      "step": 1540
+    },
+    {
+      "epoch": 1.2965286491007946,
+      "grad_norm": 0.7621095776557922,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 1550
+    },
+    {
+      "epoch": 1.3048933500627353,
+      "grad_norm": 0.7913159728050232,
+      "learning_rate": 0.0002,
+      "loss": 1.0546,
+      "step": 1560
+    },
+    {
+      "epoch": 1.3132580510246759,
+      "grad_norm": 0.9507867693901062,
+      "learning_rate": 0.0002,
+      "loss": 0.9793,
+      "step": 1570
+    },
+    {
+      "epoch": 1.3216227519866166,
+      "grad_norm": 0.7301706075668335,
+      "learning_rate": 0.0002,
+      "loss": 0.979,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3299874529485571,
+      "grad_norm": 0.7653141021728516,
+      "learning_rate": 0.0002,
+      "loss": 1.0031,
+      "step": 1590
+    },
+    {
+      "epoch": 1.3383521539104977,
+      "grad_norm": 0.6372700333595276,
+      "learning_rate": 0.0002,
+      "loss": 0.8704,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3467168548724384,
+      "grad_norm": 1.7866026163101196,
+      "learning_rate": 0.0002,
+      "loss": 1.0373,
+      "step": 1610
+    },
+    {
+      "epoch": 1.355081555834379,
+      "grad_norm": 0.6353244781494141,
+      "learning_rate": 0.0002,
+      "loss": 0.9118,
+      "step": 1620
+    },
+    {
+      "epoch": 1.3634462567963195,
+      "grad_norm": 0.7673062086105347,
+      "learning_rate": 0.0002,
+      "loss": 1.0048,
+      "step": 1630
+    },
+    {
+      "epoch": 1.3718109577582602,
+      "grad_norm": 1.1364117860794067,
+      "learning_rate": 0.0002,
+      "loss": 0.9797,
+      "step": 1640
+    },
+    {
+      "epoch": 1.3801756587202008,
+      "grad_norm": 1.0685369968414307,
+      "learning_rate": 0.0002,
+      "loss": 0.8953,
+      "step": 1650
+    },
+    {
+      "epoch": 1.3885403596821413,
+      "grad_norm": 1.1614553928375244,
+      "learning_rate": 0.0002,
+      "loss": 0.9533,
+      "step": 1660
+    },
+    {
+      "epoch": 1.396905060644082,
+      "grad_norm": 1.2501142024993896,
+      "learning_rate": 0.0002,
+      "loss": 1.0274,
+      "step": 1670
+    },
+    {
+      "epoch": 1.4052697616060226,
+      "grad_norm": 1.0739696025848389,
+      "learning_rate": 0.0002,
+      "loss": 0.9498,
+      "step": 1680
+    },
+    {
+      "epoch": 1.413634462567963,
+      "grad_norm": 0.800770103931427,
+      "learning_rate": 0.0002,
+      "loss": 1.0329,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4219991635299039,
+      "grad_norm": 0.6980189085006714,
+      "learning_rate": 0.0002,
+      "loss": 1.1194,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4303638644918444,
+      "grad_norm": 0.9088300466537476,
+      "learning_rate": 0.0002,
+      "loss": 0.9536,
+      "step": 1710
+    },
+    {
+      "epoch": 1.438728565453785,
+      "grad_norm": 1.0146790742874146,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4470932664157257,
+      "grad_norm": 2.0795905590057373,
+      "learning_rate": 0.0002,
+      "loss": 1.1158,
+      "step": 1730
+    },
+    {
+      "epoch": 1.4554579673776662,
+      "grad_norm": 0.7743622064590454,
+      "learning_rate": 0.0002,
+      "loss": 0.9421,
+      "step": 1740
+    },
+    {
+      "epoch": 1.4638226683396067,
+      "grad_norm": 0.9682395458221436,
+      "learning_rate": 0.0002,
+      "loss": 1.0351,
+      "step": 1750
+    },
+    {
+      "epoch": 1.4721873693015475,
+      "grad_norm": 0.905489981174469,
+      "learning_rate": 0.0002,
+      "loss": 1.0185,
+      "step": 1760
+    },
+    {
+      "epoch": 1.480552070263488,
+      "grad_norm": 1.1918401718139648,
+      "learning_rate": 0.0002,
+      "loss": 1.0104,
+      "step": 1770
+    },
+    {
+      "epoch": 1.4889167712254288,
+      "grad_norm": 0.5931059122085571,
+      "learning_rate": 0.0002,
+      "loss": 0.9078,
+      "step": 1780
+    },
+    {
+      "epoch": 1.4972814721873693,
+      "grad_norm": 1.197264552116394,
+      "learning_rate": 0.0002,
+      "loss": 0.9916,
+      "step": 1790
+    },
+    {
+      "epoch": 1.50564617314931,
+      "grad_norm": 1.4029070138931274,
+      "learning_rate": 0.0002,
+      "loss": 0.9754,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5140108741112506,
+      "grad_norm": 0.8593041896820068,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 1810
+    },
+    {
+      "epoch": 1.522375575073191,
+      "grad_norm": 0.750442624092102,
+      "learning_rate": 0.0002,
+      "loss": 1.0252,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5307402760351319,
+      "grad_norm": 0.7551209330558777,
+      "learning_rate": 0.0002,
+      "loss": 0.9184,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5391049769970724,
+      "grad_norm": 0.7432758808135986,
+      "learning_rate": 0.0002,
+      "loss": 0.9508,
+      "step": 1840
+    },
+    {
+      "epoch": 1.547469677959013,
+      "grad_norm": 1.0624628067016602,
+      "learning_rate": 0.0002,
+      "loss": 1.0975,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5558343789209537,
+      "grad_norm": 0.8789014220237732,
+      "learning_rate": 0.0002,
+      "loss": 0.9704,
+      "step": 1860
+    },
+    {
+      "epoch": 1.5641990798828942,
+      "grad_norm": 0.7802485823631287,
+      "learning_rate": 0.0002,
+      "loss": 1.022,
+      "step": 1870
+    },
+    {
+      "epoch": 1.5725637808448347,
+      "grad_norm": 1.129615306854248,
+      "learning_rate": 0.0002,
+      "loss": 0.922,
+      "step": 1880
+    },
+    {
+      "epoch": 1.5809284818067755,
+      "grad_norm": 1.0759961605072021,
+      "learning_rate": 0.0002,
+      "loss": 0.9252,
+      "step": 1890
+    },
+    {
+      "epoch": 1.589293182768716,
+      "grad_norm": 1.0037081241607666,
+      "learning_rate": 0.0002,
+      "loss": 0.9473,
+      "step": 1900
+    },
+    {
+      "epoch": 1.5976578837306565,
+      "grad_norm": 0.6003720164299011,
+      "learning_rate": 0.0002,
+      "loss": 0.9598,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6060225846925973,
+      "grad_norm": 0.7846575975418091,
+      "learning_rate": 0.0002,
+      "loss": 0.9492,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6143872856545378,
+      "grad_norm": 0.9737453460693359,
+      "learning_rate": 0.0002,
+      "loss": 1.0247,
+      "step": 1930
+    },
+    {
+      "epoch": 1.6227519866164783,
+      "grad_norm": 0.9219926595687866,
+      "learning_rate": 0.0002,
+      "loss": 0.9906,
+      "step": 1940
+    },
+    {
+      "epoch": 1.631116687578419,
+      "grad_norm": 0.7196545004844666,
+      "learning_rate": 0.0002,
+      "loss": 0.9447,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6394813885403596,
+      "grad_norm": 0.9171157479286194,
+      "learning_rate": 0.0002,
+      "loss": 1.0166,
+      "step": 1960
+    },
+    {
+      "epoch": 1.6478460895023002,
+      "grad_norm": 0.9991112351417542,
+      "learning_rate": 0.0002,
+      "loss": 0.9248,
+      "step": 1970
+    },
+    {
+      "epoch": 1.656210790464241,
+      "grad_norm": 1.3650590181350708,
+      "learning_rate": 0.0002,
+      "loss": 0.9775,
+      "step": 1980
+    },
+    {
+      "epoch": 1.6645754914261817,
+      "grad_norm": 0.9693202376365662,
+      "learning_rate": 0.0002,
+      "loss": 0.9501,
+      "step": 1990
+    },
+    {
+      "epoch": 1.672940192388122,
+      "grad_norm": 0.9004108309745789,
+      "learning_rate": 0.0002,
+      "loss": 1.0348,
+      "step": 2000
+    },
+    {
+      "epoch": 1.6813048933500627,
+      "grad_norm": 1.3959358930587769,
+      "learning_rate": 0.0002,
+      "loss": 1.0098,
+      "step": 2010
+    },
+    {
+      "epoch": 1.6896695943120035,
+      "grad_norm": 1.6159738302230835,
+      "learning_rate": 0.0002,
+      "loss": 1.0145,
+      "step": 2020
+    },
+    {
+      "epoch": 1.698034295273944,
+      "grad_norm": 1.1095340251922607,
+      "learning_rate": 0.0002,
+      "loss": 0.9109,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7063989962358845,
+      "grad_norm": 0.9950175881385803,
+      "learning_rate": 0.0002,
+      "loss": 0.9436,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7147636971978253,
+      "grad_norm": 0.8590125441551208,
+      "learning_rate": 0.0002,
+      "loss": 1.0235,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7231283981597658,
+      "grad_norm": 0.7302223443984985,
+      "learning_rate": 0.0002,
+      "loss": 0.9384,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7314930991217063,
+      "grad_norm": 1.0173848867416382,
+      "learning_rate": 0.0002,
+      "loss": 1.0449,
+      "step": 2070
+    },
+    {
+      "epoch": 1.739857800083647,
+      "grad_norm": 0.6308056712150574,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 2080
+    },
+    {
+      "epoch": 1.7482225010455876,
+      "grad_norm": 1.2122596502304077,
+      "learning_rate": 0.0002,
+      "loss": 0.9637,
+      "step": 2090
+    },
+    {
+      "epoch": 1.7565872020075282,
+      "grad_norm": 1.2666280269622803,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 2100
+    },
+    {
+      "epoch": 1.764951902969469,
+      "grad_norm": 1.310709834098816,
+      "learning_rate": 0.0002,
+      "loss": 0.9263,
+      "step": 2110
+    },
+    {
+      "epoch": 1.7733166039314094,
+      "grad_norm": 0.8790634870529175,
+      "learning_rate": 0.0002,
+      "loss": 0.9711,
+      "step": 2120
+    },
+    {
+      "epoch": 1.78168130489335,
+      "grad_norm": 0.8222663998603821,
+      "learning_rate": 0.0002,
+      "loss": 0.9456,
+      "step": 2130
+    },
+    {
+      "epoch": 1.7900460058552907,
+      "grad_norm": 0.6637442708015442,
+      "learning_rate": 0.0002,
+      "loss": 0.9014,
+      "step": 2140
+    },
+    {
+      "epoch": 1.7984107068172313,
+      "grad_norm": 1.2613177299499512,
+      "learning_rate": 0.0002,
+      "loss": 1.0201,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8067754077791718,
+      "grad_norm": 0.6381147503852844,
+      "learning_rate": 0.0002,
+      "loss": 0.8389,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8151401087411125,
+      "grad_norm": 1.5663173198699951,
+      "learning_rate": 0.0002,
+      "loss": 0.9553,
+      "step": 2170
+    },
+    {
+      "epoch": 1.823504809703053,
+      "grad_norm": 0.8651582598686218,
+      "learning_rate": 0.0002,
+      "loss": 0.9369,
+      "step": 2180
+    },
+    {
+      "epoch": 1.8318695106649936,
+      "grad_norm": 0.7086225152015686,
+      "learning_rate": 0.0002,
+      "loss": 0.8555,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8402342116269343,
+      "grad_norm": 1.0986076593399048,
+      "learning_rate": 0.0002,
+      "loss": 0.9588,
+      "step": 2200
+    },
+    {
+      "epoch": 1.848598912588875,
+      "grad_norm": 1.0471370220184326,
+      "learning_rate": 0.0002,
+      "loss": 0.9387,
+      "step": 2210
+    },
+    {
+      "epoch": 1.8569636135508154,
+      "grad_norm": 0.8230622410774231,
+      "learning_rate": 0.0002,
+      "loss": 1.0301,
+      "step": 2220
+    },
+    {
+      "epoch": 1.8653283145127562,
+      "grad_norm": 1.093545913696289,
+      "learning_rate": 0.0002,
+      "loss": 0.9191,
+      "step": 2230
+    },
+    {
+      "epoch": 1.873693015474697,
+      "grad_norm": 0.8182677626609802,
+      "learning_rate": 0.0002,
+      "loss": 0.8441,
+      "step": 2240
+    },
+    {
+      "epoch": 1.8820577164366374,
+      "grad_norm": 0.9356469511985779,
+      "learning_rate": 0.0002,
+      "loss": 0.901,
+      "step": 2250
+    },
+    {
+      "epoch": 1.890422417398578,
+      "grad_norm": 0.8871003985404968,
+      "learning_rate": 0.0002,
+      "loss": 0.9803,
+      "step": 2260
+    },
+    {
+      "epoch": 1.8987871183605187,
+      "grad_norm": 1.0431411266326904,
+      "learning_rate": 0.0002,
+      "loss": 1.0625,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9071518193224593,
+      "grad_norm": 1.3339753150939941,
+      "learning_rate": 0.0002,
+      "loss": 0.9897,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9155165202843998,
+      "grad_norm": 0.9365147352218628,
+      "learning_rate": 0.0002,
+      "loss": 0.9742,
+      "step": 2290
+    },
+    {
+      "epoch": 1.9238812212463405,
+      "grad_norm": 0.721367359161377,
+      "learning_rate": 0.0002,
+      "loss": 0.9071,
+      "step": 2300
+    },
+    {
+      "epoch": 1.932245922208281,
+      "grad_norm": 1.0150835514068604,
+      "learning_rate": 0.0002,
+      "loss": 0.9101,
+      "step": 2310
+    },
+    {
+      "epoch": 1.9406106231702216,
+      "grad_norm": 0.7709364891052246,
+      "learning_rate": 0.0002,
+      "loss": 0.9583,
+      "step": 2320
+    },
+    {
+      "epoch": 1.9489753241321623,
+      "grad_norm": 1.035475254058838,
+      "learning_rate": 0.0002,
+      "loss": 0.9725,
+      "step": 2330
+    },
+    {
+      "epoch": 1.9573400250941029,
+      "grad_norm": 1.641360878944397,
+      "learning_rate": 0.0002,
+      "loss": 1.0028,
+      "step": 2340
+    },
+    {
+      "epoch": 1.9657047260560434,
+      "grad_norm": 1.6609785556793213,
+      "learning_rate": 0.0002,
+      "loss": 0.9478,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9740694270179842,
+      "grad_norm": 1.0421160459518433,
+      "learning_rate": 0.0002,
+      "loss": 0.9369,
+      "step": 2360
+    },
+    {
+      "epoch": 1.9824341279799247,
+      "grad_norm": 0.5951679944992065,
+      "learning_rate": 0.0002,
+      "loss": 0.9603,
+      "step": 2370
+    },
+    {
+      "epoch": 1.9907988289418652,
+      "grad_norm": 1.2476773262023926,
+      "learning_rate": 0.0002,
+      "loss": 0.9483,
+      "step": 2380
+    },
+    {
+      "epoch": 1.999163529903806,
+      "grad_norm": 1.0104742050170898,
+      "learning_rate": 0.0002,
+      "loss": 0.9474,
+      "step": 2390
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1764153242111206,
+      "eval_runtime": 82.769,
+      "eval_samples_per_second": 5.509,
+      "eval_steps_per_second": 0.689,
+      "step": 2391
+    },
+    {
+      "epoch": 2.0075282308657467,
+      "grad_norm": 0.7234944105148315,
+      "learning_rate": 0.0002,
+      "loss": 0.8288,
+      "step": 2400
+    },
+    {
+      "epoch": 2.015892931827687,
+      "grad_norm": 0.9550215601921082,
+      "learning_rate": 0.0002,
+      "loss": 0.7429,
+      "step": 2410
+    },
+    {
+      "epoch": 2.024257632789628,
+      "grad_norm": 1.2027040719985962,
+      "learning_rate": 0.0002,
+      "loss": 0.7585,
+      "step": 2420
+    },
+    {
+      "epoch": 2.0326223337515685,
+      "grad_norm": 0.6850367784500122,
+      "learning_rate": 0.0002,
+      "loss": 0.787,
+      "step": 2430
+    },
+    {
+      "epoch": 2.040987034713509,
+      "grad_norm": 0.9690644145011902,
+      "learning_rate": 0.0002,
+      "loss": 0.7446,
+      "step": 2440
+    },
+    {
+      "epoch": 2.0493517356754496,
+      "grad_norm": 1.1414319276809692,
+      "learning_rate": 0.0002,
+      "loss": 0.7705,
+      "step": 2450
+    },
+    {
+      "epoch": 2.0577164366373903,
+      "grad_norm": 0.9193547964096069,
+      "learning_rate": 0.0002,
+      "loss": 0.7652,
+      "step": 2460
+    },
+    {
+      "epoch": 2.0660811375993307,
+      "grad_norm": 1.358508825302124,
+      "learning_rate": 0.0002,
+      "loss": 0.8316,
+      "step": 2470
+    },
+    {
+      "epoch": 2.0744458385612714,
+      "grad_norm": 0.863886296749115,
+      "learning_rate": 0.0002,
+      "loss": 0.8208,
+      "step": 2480
+    },
+    {
+      "epoch": 2.082810539523212,
+      "grad_norm": 1.2565582990646362,
+      "learning_rate": 0.0002,
+      "loss": 0.7898,
+      "step": 2490
+    },
+    {
+      "epoch": 2.0911752404851525,
+      "grad_norm": 1.3817089796066284,
+      "learning_rate": 0.0002,
+      "loss": 0.7448,
+      "step": 2500
+    },
+    {
+      "epoch": 2.099539941447093,
+      "grad_norm": 1.2113038301467896,
+      "learning_rate": 0.0002,
+      "loss": 0.7404,
+      "step": 2510
+    },
+    {
+      "epoch": 2.107904642409034,
+      "grad_norm": 0.7887806296348572,
+      "learning_rate": 0.0002,
+      "loss": 0.7189,
+      "step": 2520
+    },
+    {
+      "epoch": 2.1162693433709743,
+      "grad_norm": 0.8972041010856628,
+      "learning_rate": 0.0002,
+      "loss": 0.7905,
+      "step": 2530
+    },
+    {
+      "epoch": 2.124634044332915,
+      "grad_norm": 1.3625597953796387,
+      "learning_rate": 0.0002,
+      "loss": 0.6934,
+      "step": 2540
+    },
+    {
+      "epoch": 2.132998745294856,
+      "grad_norm": 1.819085955619812,
+      "learning_rate": 0.0002,
+      "loss": 0.7833,
+      "step": 2550
+    },
+    {
+      "epoch": 2.141363446256796,
+      "grad_norm": 0.965623140335083,
+      "learning_rate": 0.0002,
+      "loss": 0.6858,
+      "step": 2560
+    },
+    {
+      "epoch": 2.149728147218737,
+      "grad_norm": 0.8528746366500854,
+      "learning_rate": 0.0002,
+      "loss": 0.7498,
+      "step": 2570
+    },
+    {
+      "epoch": 2.1580928481806776,
+      "grad_norm": 0.8238094449043274,
+      "learning_rate": 0.0002,
+      "loss": 0.8329,
+      "step": 2580
+    },
+    {
+      "epoch": 2.1664575491426183,
+      "grad_norm": 0.9206092953681946,
+      "learning_rate": 0.0002,
+      "loss": 0.8329,
+      "step": 2590
+    },
+    {
+      "epoch": 2.1748222501045587,
+      "grad_norm": 1.3594036102294922,
+      "learning_rate": 0.0002,
+      "loss": 0.8068,
+      "step": 2600
+    },
+    {
+      "epoch": 2.1831869510664994,
+      "grad_norm": 0.9997738599777222,
+      "learning_rate": 0.0002,
+      "loss": 0.717,
+      "step": 2610
+    },
+    {
+      "epoch": 2.19155165202844,
+      "grad_norm": 0.9230810403823853,
+      "learning_rate": 0.0002,
+      "loss": 0.8211,
+      "step": 2620
+    },
+    {
+      "epoch": 2.1999163529903805,
+      "grad_norm": 0.859367311000824,
+      "learning_rate": 0.0002,
+      "loss": 0.8048,
+      "step": 2630
+    },
+    {
+      "epoch": 2.208281053952321,
+      "grad_norm": 1.087170958518982,
+      "learning_rate": 0.0002,
+      "loss": 0.793,
+      "step": 2640
+    },
+    {
+      "epoch": 2.216645754914262,
+      "grad_norm": 0.8764513731002808,
+      "learning_rate": 0.0002,
+      "loss": 0.7623,
+      "step": 2650
+    },
+    {
+      "epoch": 2.2250104558762023,
+      "grad_norm": 1.4553709030151367,
+      "learning_rate": 0.0002,
+      "loss": 0.7349,
+      "step": 2660
+    },
+    {
+      "epoch": 2.233375156838143,
+      "grad_norm": 0.8835197687149048,
+      "learning_rate": 0.0002,
+      "loss": 0.768,
+      "step": 2670
+    },
+    {
+      "epoch": 2.241739857800084,
+      "grad_norm": 3.089097023010254,
+      "learning_rate": 0.0002,
+      "loss": 0.8307,
+      "step": 2680
+    },
+    {
+      "epoch": 2.250104558762024,
+      "grad_norm": 1.1077880859375,
+      "learning_rate": 0.0002,
+      "loss": 0.7848,
+      "step": 2690
+    },
+    {
+      "epoch": 2.258469259723965,
+      "grad_norm": 0.99500972032547,
+      "learning_rate": 0.0002,
+      "loss": 0.8162,
+      "step": 2700
+    },
+    {
+      "epoch": 2.2668339606859056,
+      "grad_norm": 1.1205966472625732,
+      "learning_rate": 0.0002,
+      "loss": 0.8746,
+      "step": 2710
+    },
+    {
+      "epoch": 2.275198661647846,
+      "grad_norm": 1.661110520362854,
+      "learning_rate": 0.0002,
+      "loss": 0.7618,
+      "step": 2720
+    },
+    {
+      "epoch": 2.2835633626097867,
+      "grad_norm": 0.8378655910491943,
+      "learning_rate": 0.0002,
+      "loss": 0.7953,
+      "step": 2730
+    },
+    {
+      "epoch": 2.2919280635717274,
+      "grad_norm": 0.896804690361023,
+      "learning_rate": 0.0002,
+      "loss": 0.8631,
+      "step": 2740
+    },
+    {
+      "epoch": 2.300292764533668,
+      "grad_norm": 1.3855854272842407,
+      "learning_rate": 0.0002,
+      "loss": 0.8486,
+      "step": 2750
+    },
+    {
+      "epoch": 2.3086574654956085,
+      "grad_norm": 1.2021458148956299,
+      "learning_rate": 0.0002,
+      "loss": 0.7588,
+      "step": 2760
+    },
+    {
+      "epoch": 2.317022166457549,
+      "grad_norm": 0.899120032787323,
+      "learning_rate": 0.0002,
+      "loss": 0.7919,
+      "step": 2770
+    },
+    {
+      "epoch": 2.32538686741949,
+      "grad_norm": 1.828062653541565,
+      "learning_rate": 0.0002,
+      "loss": 0.7732,
+      "step": 2780
+    },
+    {
+      "epoch": 2.3337515683814303,
+      "grad_norm": 0.9991434216499329,
+      "learning_rate": 0.0002,
+      "loss": 0.8045,
+      "step": 2790
+    },
+    {
+      "epoch": 2.342116269343371,
+      "grad_norm": 1.0790653228759766,
+      "learning_rate": 0.0002,
+      "loss": 0.7822,
+      "step": 2800
+    },
+    {
+      "epoch": 2.350480970305312,
+      "grad_norm": 1.0230239629745483,
+      "learning_rate": 0.0002,
+      "loss": 0.8078,
+      "step": 2810
+    },
+    {
+      "epoch": 2.358845671267252,
+      "grad_norm": 1.5933716297149658,
+      "learning_rate": 0.0002,
+      "loss": 0.7708,
+      "step": 2820
+    },
+    {
+      "epoch": 2.367210372229193,
+      "grad_norm": 0.9329839944839478,
+      "learning_rate": 0.0002,
+      "loss": 0.8118,
+      "step": 2830
+    },
+    {
+      "epoch": 2.3755750731911336,
+      "grad_norm": 1.3260419368743896,
+      "learning_rate": 0.0002,
+      "loss": 0.9203,
+      "step": 2840
+    },
+    {
+      "epoch": 2.383939774153074,
+      "grad_norm": 1.5849716663360596,
+      "learning_rate": 0.0002,
+      "loss": 0.8069,
+      "step": 2850
+    },
+    {
+      "epoch": 2.3923044751150147,
+      "grad_norm": 1.3823819160461426,
+      "learning_rate": 0.0002,
+      "loss": 0.7742,
+      "step": 2860
+    },
+    {
+      "epoch": 2.4006691760769554,
+      "grad_norm": 1.9762850999832153,
+      "learning_rate": 0.0002,
+      "loss": 0.7546,
+      "step": 2870
+    },
+    {
+      "epoch": 2.4090338770388957,
+      "grad_norm": 1.26347017288208,
+      "learning_rate": 0.0002,
+      "loss": 0.7905,
+      "step": 2880
+    },
+    {
+      "epoch": 2.4173985780008365,
+      "grad_norm": 1.3072547912597656,
+      "learning_rate": 0.0002,
+      "loss": 0.7065,
+      "step": 2890
+    },
+    {
+      "epoch": 2.425763278962777,
+      "grad_norm": 0.9560710787773132,
+      "learning_rate": 0.0002,
+      "loss": 0.787,
+      "step": 2900
+    },
+    {
+      "epoch": 2.4341279799247175,
+      "grad_norm": 1.218003749847412,
+      "learning_rate": 0.0002,
+      "loss": 0.7588,
+      "step": 2910
+    },
+    {
+      "epoch": 2.4424926808866583,
+      "grad_norm": 1.2331637144088745,
+      "learning_rate": 0.0002,
+      "loss": 0.793,
+      "step": 2920
+    },
+    {
+      "epoch": 2.450857381848599,
+      "grad_norm": 0.8522817492485046,
+      "learning_rate": 0.0002,
+      "loss": 0.7584,
+      "step": 2930
+    },
+    {
+      "epoch": 2.4592220828105393,
+      "grad_norm": 1.088078260421753,
+      "learning_rate": 0.0002,
+      "loss": 0.8426,
+      "step": 2940
+    },
+    {
+      "epoch": 2.46758678377248,
+      "grad_norm": 1.8097302913665771,
+      "learning_rate": 0.0002,
+      "loss": 0.8165,
+      "step": 2950
+    },
+    {
+      "epoch": 2.475951484734421,
+      "grad_norm": 1.3536505699157715,
+      "learning_rate": 0.0002,
+      "loss": 0.7653,
+      "step": 2960
+    },
+    {
+      "epoch": 2.484316185696361,
+      "grad_norm": 1.1186577081680298,
+      "learning_rate": 0.0002,
+      "loss": 0.8158,
+      "step": 2970
+    },
+    {
+      "epoch": 2.492680886658302,
+      "grad_norm": 0.9760162234306335,
+      "learning_rate": 0.0002,
+      "loss": 0.8035,
+      "step": 2980
+    },
+    {
+      "epoch": 2.5010455876202426,
+      "grad_norm": 0.8375564217567444,
+      "learning_rate": 0.0002,
+      "loss": 0.7839,
+      "step": 2990
+    },
+    {
+      "epoch": 2.509410288582183,
+      "grad_norm": 1.2951868772506714,
+      "learning_rate": 0.0002,
+      "loss": 0.8406,
+      "step": 3000
+    },
+    {
+      "epoch": 2.5177749895441237,
+      "grad_norm": 1.4565622806549072,
+      "learning_rate": 0.0002,
+      "loss": 0.7518,
+      "step": 3010
+    },
+    {
+      "epoch": 2.5261396905060645,
+      "grad_norm": 1.3650271892547607,
+      "learning_rate": 0.0002,
+      "loss": 0.7281,
+      "step": 3020
+    },
+    {
+      "epoch": 2.5345043914680048,
+      "grad_norm": 0.9309256076812744,
+      "learning_rate": 0.0002,
+      "loss": 0.7661,
+      "step": 3030
+    },
+    {
+      "epoch": 2.5428690924299455,
+      "grad_norm": 1.07468581199646,
+      "learning_rate": 0.0002,
+      "loss": 0.8193,
+      "step": 3040
+    },
+    {
+      "epoch": 2.5512337933918863,
+      "grad_norm": 1.3080440759658813,
+      "learning_rate": 0.0002,
+      "loss": 0.7766,
+      "step": 3050
+    },
+    {
+      "epoch": 2.5595984943538266,
+      "grad_norm": 1.1031043529510498,
+      "learning_rate": 0.0002,
+      "loss": 0.7574,
+      "step": 3060
+    },
+    {
+      "epoch": 2.5679631953157673,
+      "grad_norm": 1.1987742185592651,
+      "learning_rate": 0.0002,
+      "loss": 0.8079,
+      "step": 3070
+    },
+    {
+      "epoch": 2.576327896277708,
+      "grad_norm": 1.0586541891098022,
+      "learning_rate": 0.0002,
+      "loss": 0.7845,
+      "step": 3080
+    },
+    {
+      "epoch": 2.584692597239649,
+      "grad_norm": 1.102643370628357,
+      "learning_rate": 0.0002,
+      "loss": 0.717,
+      "step": 3090
+    },
+    {
+      "epoch": 2.593057298201589,
+      "grad_norm": 0.9427294731140137,
+      "learning_rate": 0.0002,
+      "loss": 0.7393,
+      "step": 3100
+    },
+    {
+      "epoch": 2.60142199916353,
+      "grad_norm": 1.0983883142471313,
+      "learning_rate": 0.0002,
+      "loss": 0.7737,
+      "step": 3110
+    },
+    {
+      "epoch": 2.6097867001254706,
+      "grad_norm": 1.0884950160980225,
+      "learning_rate": 0.0002,
+      "loss": 0.8082,
+      "step": 3120
+    },
+    {
+      "epoch": 2.6181514010874114,
+      "grad_norm": 0.7318823933601379,
+      "learning_rate": 0.0002,
+      "loss": 0.8692,
+      "step": 3130
+    },
+    {
+      "epoch": 2.6265161020493517,
+      "grad_norm": 1.191433310508728,
+      "learning_rate": 0.0002,
+      "loss": 0.8717,
+      "step": 3140
+    },
+    {
+      "epoch": 2.6348808030112925,
+      "grad_norm": 1.2203903198242188,
+      "learning_rate": 0.0002,
+      "loss": 0.7807,
+      "step": 3150
+    },
+    {
+      "epoch": 2.643245503973233,
+      "grad_norm": 0.9203769564628601,
+      "learning_rate": 0.0002,
+      "loss": 0.8545,
+      "step": 3160
+    },
+    {
+      "epoch": 2.6516102049351735,
+      "grad_norm": 1.298902988433838,
+      "learning_rate": 0.0002,
+      "loss": 0.8506,
+      "step": 3170
+    },
+    {
+      "epoch": 2.6599749058971143,
+      "grad_norm": 1.1898419857025146,
+      "learning_rate": 0.0002,
+      "loss": 0.7738,
+      "step": 3180
+    },
+    {
+      "epoch": 2.668339606859055,
+      "grad_norm": 1.0493841171264648,
+      "learning_rate": 0.0002,
+      "loss": 0.7995,
+      "step": 3190
+    },
+    {
+      "epoch": 2.6767043078209953,
+      "grad_norm": 1.242353081703186,
+      "learning_rate": 0.0002,
+      "loss": 0.7892,
+      "step": 3200
+    },
+    {
+      "epoch": 2.685069008782936,
+      "grad_norm": 0.9045358896255493,
+      "learning_rate": 0.0002,
+      "loss": 0.7538,
+      "step": 3210
+    },
+    {
+      "epoch": 2.693433709744877,
+      "grad_norm": 0.8066250085830688,
+      "learning_rate": 0.0002,
+      "loss": 0.8079,
+      "step": 3220
+    },
+    {
+      "epoch": 2.701798410706817,
+      "grad_norm": 1.0554566383361816,
+      "learning_rate": 0.0002,
+      "loss": 0.8163,
+      "step": 3230
+    },
+    {
+      "epoch": 2.710163111668758,
+      "grad_norm": 1.2041901350021362,
+      "learning_rate": 0.0002,
+      "loss": 0.8311,
+      "step": 3240
+    },
+    {
+      "epoch": 2.7185278126306986,
+      "grad_norm": 0.882590651512146,
+      "learning_rate": 0.0002,
+      "loss": 0.7968,
+      "step": 3250
+    },
+    {
+      "epoch": 2.726892513592639,
+      "grad_norm": 1.0998138189315796,
+      "learning_rate": 0.0002,
+      "loss": 0.7699,
+      "step": 3260
+    },
+    {
+      "epoch": 2.7352572145545797,
+      "grad_norm": 0.996216356754303,
+      "learning_rate": 0.0002,
+      "loss": 0.6923,
+      "step": 3270
+    },
+    {
+      "epoch": 2.7436219155165205,
+      "grad_norm": 0.8555099368095398,
+      "learning_rate": 0.0002,
+      "loss": 0.7935,
+      "step": 3280
+    },
+    {
+      "epoch": 2.7519866164784608,
+      "grad_norm": 1.1827069520950317,
+      "learning_rate": 0.0002,
+      "loss": 0.7199,
+      "step": 3290
+    },
+    {
+      "epoch": 2.7603513174404015,
+      "grad_norm": 1.0703648328781128,
+      "learning_rate": 0.0002,
+      "loss": 0.8036,
+      "step": 3300
+    },
+    {
+      "epoch": 2.7687160184023423,
+      "grad_norm": 1.3048015832901,
+      "learning_rate": 0.0002,
+      "loss": 0.8174,
+      "step": 3310
+    },
+    {
+      "epoch": 2.7770807193642826,
+      "grad_norm": 1.169602632522583,
+      "learning_rate": 0.0002,
+      "loss": 0.8535,
+      "step": 3320
+    },
+    {
+      "epoch": 2.7854454203262233,
+      "grad_norm": 1.2847896814346313,
+      "learning_rate": 0.0002,
+      "loss": 0.8343,
+      "step": 3330
+    },
+    {
+      "epoch": 2.793810121288164,
+      "grad_norm": 1.2243924140930176,
+      "learning_rate": 0.0002,
+      "loss": 0.855,
+      "step": 3340
+    },
+    {
+      "epoch": 2.8021748222501044,
+      "grad_norm": 0.8506208658218384,
+      "learning_rate": 0.0002,
+      "loss": 0.6935,
+      "step": 3350
+    },
+    {
+      "epoch": 2.810539523212045,
+      "grad_norm": 1.4431512355804443,
+      "learning_rate": 0.0002,
+      "loss": 0.765,
+      "step": 3360
+    },
+    {
+      "epoch": 2.818904224173986,
+      "grad_norm": 1.4266667366027832,
+      "learning_rate": 0.0002,
+      "loss": 0.7762,
+      "step": 3370
+    },
+    {
+      "epoch": 2.827268925135926,
+      "grad_norm": 0.9062533378601074,
+      "learning_rate": 0.0002,
+      "loss": 0.7494,
+      "step": 3380
+    },
+    {
+      "epoch": 2.835633626097867,
+      "grad_norm": 1.1310938596725464,
+      "learning_rate": 0.0002,
+      "loss": 0.7772,
+      "step": 3390
+    },
+    {
+      "epoch": 2.8439983270598077,
+      "grad_norm": 1.0112131834030151,
+      "learning_rate": 0.0002,
+      "loss": 0.7389,
+      "step": 3400
+    },
+    {
+      "epoch": 2.852363028021748,
+      "grad_norm": 2.258208751678467,
+      "learning_rate": 0.0002,
+      "loss": 0.8284,
+      "step": 3410
+    },
+    {
+      "epoch": 2.8607277289836888,
+      "grad_norm": 0.8820013403892517,
+      "learning_rate": 0.0002,
+      "loss": 0.8338,
+      "step": 3420
+    },
+    {
+      "epoch": 2.8690924299456295,
+      "grad_norm": 1.142311453819275,
+      "learning_rate": 0.0002,
+      "loss": 0.805,
+      "step": 3430
+    },
+    {
+      "epoch": 2.87745713090757,
+      "grad_norm": 1.8235642910003662,
+      "learning_rate": 0.0002,
+      "loss": 0.8523,
+      "step": 3440
+    },
+    {
+      "epoch": 2.8858218318695106,
+      "grad_norm": 1.963779330253601,
+      "learning_rate": 0.0002,
+      "loss": 0.783,
+      "step": 3450
+    },
+    {
+      "epoch": 2.8941865328314513,
+      "grad_norm": 1.1153792142868042,
+      "learning_rate": 0.0002,
+      "loss": 0.8492,
+      "step": 3460
+    },
+    {
+      "epoch": 2.9025512337933916,
+      "grad_norm": 1.216044545173645,
+      "learning_rate": 0.0002,
+      "loss": 0.8288,
+      "step": 3470
+    },
+    {
+      "epoch": 2.9109159347553324,
+      "grad_norm": 1.0727925300598145,
+      "learning_rate": 0.0002,
+      "loss": 0.7853,
+      "step": 3480
+    },
+    {
+      "epoch": 2.919280635717273,
+      "grad_norm": 1.4920282363891602,
+      "learning_rate": 0.0002,
+      "loss": 0.7624,
+      "step": 3490
+    },
+    {
+      "epoch": 2.9276453366792135,
+      "grad_norm": 2.0315160751342773,
+      "learning_rate": 0.0002,
+      "loss": 0.8183,
+      "step": 3500
+    },
+    {
+      "epoch": 2.936010037641154,
+      "grad_norm": 1.0698133707046509,
+      "learning_rate": 0.0002,
+      "loss": 0.8094,
+      "step": 3510
+    },
+    {
+      "epoch": 2.944374738603095,
+      "grad_norm": 1.2448259592056274,
+      "learning_rate": 0.0002,
+      "loss": 0.8727,
+      "step": 3520
+    },
+    {
+      "epoch": 2.9527394395650357,
+      "grad_norm": 0.9577398896217346,
+      "learning_rate": 0.0002,
+      "loss": 0.8672,
+      "step": 3530
+    },
+    {
+      "epoch": 2.961104140526976,
+      "grad_norm": 1.1637893915176392,
+      "learning_rate": 0.0002,
+      "loss": 0.8408,
+      "step": 3540
+    },
+    {
+      "epoch": 2.9694688414889168,
+      "grad_norm": 1.5379204750061035,
+      "learning_rate": 0.0002,
+      "loss": 0.719,
+      "step": 3550
+    },
+    {
+      "epoch": 2.9778335424508575,
+      "grad_norm": 1.3025894165039062,
+      "learning_rate": 0.0002,
+      "loss": 0.8086,
+      "step": 3560
+    },
+    {
+      "epoch": 2.9861982434127983,
+      "grad_norm": 1.049248456954956,
+      "learning_rate": 0.0002,
+      "loss": 0.8768,
+      "step": 3570
+    },
+    {
+      "epoch": 2.9945629443747386,
+      "grad_norm": 1.524281620979309,
+      "learning_rate": 0.0002,
+      "loss": 0.7704,
+      "step": 3580
+    },
+    {
+      "epoch": 2.9995817649519028,
+      "eval_loss": 1.2029013633728027,
+      "eval_runtime": 33.4084,
+      "eval_samples_per_second": 13.649,
+      "eval_steps_per_second": 1.706,
+      "step": 3586
+    },
+    {
+      "epoch": 3.0029276453366793,
+      "grad_norm": 1.1627998352050781,
+      "learning_rate": 0.0002,
+      "loss": 0.7749,
+      "step": 3590
+    },
+    {
+      "epoch": 3.0112923462986196,
+      "grad_norm": 1.1400229930877686,
+      "learning_rate": 0.0002,
+      "loss": 0.6608,
+      "step": 3600
+    },
+    {
+      "epoch": 3.0196570472605604,
+      "grad_norm": 1.3419189453125,
+      "learning_rate": 0.0002,
+      "loss": 0.6045,
+      "step": 3610
+    },
+    {
+      "epoch": 3.028021748222501,
+      "grad_norm": 1.1323829889297485,
+      "learning_rate": 0.0002,
+      "loss": 0.5907,
+      "step": 3620
+    },
+    {
+      "epoch": 3.0363864491844414,
+      "grad_norm": 1.478824257850647,
+      "learning_rate": 0.0002,
+      "loss": 0.5544,
+      "step": 3630
+    },
+    {
+      "epoch": 3.044751150146382,
+      "grad_norm": 1.032449722290039,
+      "learning_rate": 0.0002,
+      "loss": 0.6039,
+      "step": 3640
+    },
+    {
+      "epoch": 3.053115851108323,
+      "grad_norm": 1.4655747413635254,
+      "learning_rate": 0.0002,
+      "loss": 0.6394,
+      "step": 3650
+    },
+    {
+      "epoch": 3.0614805520702637,
+      "grad_norm": 1.3922516107559204,
+      "learning_rate": 0.0002,
+      "loss": 0.575,
+      "step": 3660
+    },
+    {
+      "epoch": 3.069845253032204,
+      "grad_norm": 1.2919706106185913,
+      "learning_rate": 0.0002,
+      "loss": 0.5575,
+      "step": 3670
+    },
+    {
+      "epoch": 3.0782099539941448,
+      "grad_norm": 1.1615499258041382,
+      "learning_rate": 0.0002,
+      "loss": 0.63,
+      "step": 3680
+    },
+    {
+      "epoch": 3.0865746549560855,
+      "grad_norm": 1.798956036567688,
+      "learning_rate": 0.0002,
+      "loss": 0.6093,
+      "step": 3690
+    },
+    {
+      "epoch": 3.094939355918026,
+      "grad_norm": 1.729689359664917,
+      "learning_rate": 0.0002,
+      "loss": 0.621,
+      "step": 3700
+    },
+    {
+      "epoch": 3.1033040568799666,
+      "grad_norm": 1.3400843143463135,
+      "learning_rate": 0.0002,
+      "loss": 0.65,
+      "step": 3710
+    },
+    {
+      "epoch": 3.1116687578419073,
+      "grad_norm": 1.0873488187789917,
+      "learning_rate": 0.0002,
+      "loss": 0.5644,
+      "step": 3720
+    },
+    {
+      "epoch": 3.1200334588038476,
+      "grad_norm": 1.2117328643798828,
+      "learning_rate": 0.0002,
+      "loss": 0.6086,
+      "step": 3730
+    },
+    {
+      "epoch": 3.1283981597657884,
+      "grad_norm": 1.4171959161758423,
+      "learning_rate": 0.0002,
+      "loss": 0.6118,
+      "step": 3740
+    },
+    {
+      "epoch": 3.136762860727729,
+      "grad_norm": 1.4323008060455322,
+      "learning_rate": 0.0002,
+      "loss": 0.6034,
+      "step": 3750
+    },
+    {
+      "epoch": 3.1451275616896694,
+      "grad_norm": 1.2421602010726929,
+      "learning_rate": 0.0002,
+      "loss": 0.5944,
+      "step": 3760
+    },
+    {
+      "epoch": 3.15349226265161,
+      "grad_norm": 2.0923776626586914,
+      "learning_rate": 0.0002,
+      "loss": 0.5363,
+      "step": 3770
+    },
+    {
+      "epoch": 3.161856963613551,
+      "grad_norm": 1.2325282096862793,
+      "learning_rate": 0.0002,
+      "loss": 0.6052,
+      "step": 3780
+    },
+    {
+      "epoch": 3.1702216645754913,
+      "grad_norm": 1.083258867263794,
+      "learning_rate": 0.0002,
+      "loss": 0.5722,
+      "step": 3790
+    },
+    {
+      "epoch": 3.178586365537432,
+      "grad_norm": 1.3633801937103271,
+      "learning_rate": 0.0002,
+      "loss": 0.6227,
+      "step": 3800
+    },
+    {
+      "epoch": 3.1869510664993728,
+      "grad_norm": 1.137007236480713,
+      "learning_rate": 0.0002,
+      "loss": 0.6082,
+      "step": 3810
+    },
+    {
+      "epoch": 3.195315767461313,
+      "grad_norm": 1.395223617553711,
+      "learning_rate": 0.0002,
+      "loss": 0.6484,
+      "step": 3820
+    },
+    {
+      "epoch": 3.203680468423254,
+      "grad_norm": 1.5808054208755493,
+      "learning_rate": 0.0002,
+      "loss": 0.6014,
+      "step": 3830
+    },
+    {
+      "epoch": 3.2120451693851946,
+      "grad_norm": 1.615455150604248,
+      "learning_rate": 0.0002,
+      "loss": 0.6249,
+      "step": 3840
+    },
+    {
+      "epoch": 3.220409870347135,
+      "grad_norm": 1.711955189704895,
+      "learning_rate": 0.0002,
+      "loss": 0.5868,
+      "step": 3850
+    },
+    {
+      "epoch": 3.2287745713090756,
+      "grad_norm": 1.0368887186050415,
+      "learning_rate": 0.0002,
+      "loss": 0.618,
+      "step": 3860
+    },
+    {
+      "epoch": 3.2371392722710164,
+      "grad_norm": 1.0486373901367188,
+      "learning_rate": 0.0002,
+      "loss": 0.5826,
+      "step": 3870
+    },
+    {
+      "epoch": 3.2455039732329567,
+      "grad_norm": 1.1221380233764648,
+      "learning_rate": 0.0002,
+      "loss": 0.7023,
+      "step": 3880
+    },
+    {
+      "epoch": 3.2538686741948974,
+      "grad_norm": 1.8587424755096436,
+      "learning_rate": 0.0002,
+      "loss": 0.6442,
+      "step": 3890
+    },
+    {
+      "epoch": 3.262233375156838,
+      "grad_norm": 1.3617318868637085,
+      "learning_rate": 0.0002,
+      "loss": 0.6113,
+      "step": 3900
+    },
+    {
+      "epoch": 3.2705980761187785,
+      "grad_norm": 1.2195371389389038,
+      "learning_rate": 0.0002,
+      "loss": 0.6062,
+      "step": 3910
+    },
+    {
+      "epoch": 3.2789627770807193,
+      "grad_norm": 1.46000075340271,
+      "learning_rate": 0.0002,
+      "loss": 0.5785,
+      "step": 3920
+    },
+    {
+      "epoch": 3.28732747804266,
+      "grad_norm": 1.7811700105667114,
+      "learning_rate": 0.0002,
+      "loss": 0.6824,
+      "step": 3930
+    },
+    {
+      "epoch": 3.2956921790046008,
+      "grad_norm": 1.5718696117401123,
+      "learning_rate": 0.0002,
+      "loss": 0.668,
+      "step": 3940
+    },
+    {
+      "epoch": 3.304056879966541,
+      "grad_norm": 0.8652304410934448,
+      "learning_rate": 0.0002,
+      "loss": 0.5484,
+      "step": 3950
+    },
+    {
+      "epoch": 3.312421580928482,
+      "grad_norm": 1.4029780626296997,
+      "learning_rate": 0.0002,
+      "loss": 0.6176,
+      "step": 3960
+    },
+    {
+      "epoch": 3.3207862818904226,
+      "grad_norm": 1.3114005327224731,
+      "learning_rate": 0.0002,
+      "loss": 0.6327,
+      "step": 3970
+    },
+    {
+      "epoch": 3.329150982852363,
+      "grad_norm": 1.3586502075195312,
+      "learning_rate": 0.0002,
+      "loss": 0.6622,
+      "step": 3980
+    },
+    {
+      "epoch": 3.3375156838143036,
+      "grad_norm": 1.3389734029769897,
+      "learning_rate": 0.0002,
+      "loss": 0.6943,
+      "step": 3990
+    },
+    {
+      "epoch": 3.3458803847762444,
+      "grad_norm": 1.3388065099716187,
+      "learning_rate": 0.0002,
+      "loss": 0.6197,
+      "step": 4000
+    },
+    {
+      "epoch": 3.3542450857381847,
+      "grad_norm": 1.2245055437088013,
+      "learning_rate": 0.0002,
+      "loss": 0.5369,
+      "step": 4010
+    },
+    {
+      "epoch": 3.3626097867001254,
+      "grad_norm": 1.2442208528518677,
+      "learning_rate": 0.0002,
+      "loss": 0.598,
+      "step": 4020
+    },
+    {
+      "epoch": 3.370974487662066,
+      "grad_norm": 1.5834486484527588,
+      "learning_rate": 0.0002,
+      "loss": 0.6926,
+      "step": 4030
+    },
+    {
+      "epoch": 3.3793391886240065,
+      "grad_norm": 1.1468696594238281,
+      "learning_rate": 0.0002,
+      "loss": 0.6695,
+      "step": 4040
+    },
+    {
+      "epoch": 3.3877038895859473,
+      "grad_norm": 1.692815899848938,
+      "learning_rate": 0.0002,
+      "loss": 0.6022,
+      "step": 4050
+    },
+    {
+      "epoch": 3.396068590547888,
+      "grad_norm": 1.3289376497268677,
+      "learning_rate": 0.0002,
+      "loss": 0.635,
+      "step": 4060
+    },
+    {
+      "epoch": 3.4044332915098288,
+      "grad_norm": 1.3257668018341064,
+      "learning_rate": 0.0002,
+      "loss": 0.6816,
+      "step": 4070
+    },
+    {
+      "epoch": 3.412797992471769,
+      "grad_norm": 1.664469838142395,
+      "learning_rate": 0.0002,
+      "loss": 0.6283,
+      "step": 4080
+    },
+    {
+      "epoch": 3.42116269343371,
+      "grad_norm": 1.7101385593414307,
+      "learning_rate": 0.0002,
+      "loss": 0.5756,
+      "step": 4090
+    },
+    {
+      "epoch": 3.4295273943956506,
+      "grad_norm": 1.135727882385254,
+      "learning_rate": 0.0002,
+      "loss": 0.6101,
+      "step": 4100
+    },
+    {
+      "epoch": 3.437892095357591,
+      "grad_norm": 1.2059178352355957,
+      "learning_rate": 0.0002,
+      "loss": 0.6003,
+      "step": 4110
+    },
+    {
+      "epoch": 3.4462567963195316,
+      "grad_norm": 2.1606826782226562,
+      "learning_rate": 0.0002,
+      "loss": 0.6753,
+      "step": 4120
+    },
+    {
+      "epoch": 3.4546214972814724,
+      "grad_norm": 1.7403228282928467,
+      "learning_rate": 0.0002,
+      "loss": 0.6144,
+      "step": 4130
+    },
+    {
+      "epoch": 3.4629861982434127,
+      "grad_norm": 1.403611183166504,
+      "learning_rate": 0.0002,
+      "loss": 0.6247,
+      "step": 4140
+    },
+    {
+      "epoch": 3.4713508992053534,
+      "grad_norm": 1.2230652570724487,
+      "learning_rate": 0.0002,
+      "loss": 0.6803,
+      "step": 4150
+    },
+    {
+      "epoch": 3.479715600167294,
+      "grad_norm": 1.3496795892715454,
+      "learning_rate": 0.0002,
+      "loss": 0.6271,
+      "step": 4160
+    },
+    {
+      "epoch": 3.4880803011292345,
+      "grad_norm": 1.8366918563842773,
+      "learning_rate": 0.0002,
+      "loss": 0.6497,
+      "step": 4170
+    },
+    {
+      "epoch": 3.4964450020911753,
+      "grad_norm": 2.147662878036499,
+      "learning_rate": 0.0002,
+      "loss": 0.7142,
+      "step": 4180
+    },
+    {
+      "epoch": 3.504809703053116,
+      "grad_norm": 2.0214715003967285,
+      "learning_rate": 0.0002,
+      "loss": 0.6325,
+      "step": 4190
+    },
+    {
+      "epoch": 3.5131744040150563,
+      "grad_norm": 1.5367511510849,
+      "learning_rate": 0.0002,
+      "loss": 0.6891,
+      "step": 4200
+    },
+    {
+      "epoch": 3.521539104976997,
+      "grad_norm": 1.5371453762054443,
+      "learning_rate": 0.0002,
+      "loss": 0.6193,
+      "step": 4210
+    },
+    {
+      "epoch": 3.529903805938938,
+      "grad_norm": 1.3281495571136475,
+      "learning_rate": 0.0002,
+      "loss": 0.6702,
+      "step": 4220
+    },
+    {
+      "epoch": 3.538268506900878,
+      "grad_norm": 1.799167513847351,
+      "learning_rate": 0.0002,
+      "loss": 0.6681,
+      "step": 4230
+    },
+    {
+      "epoch": 3.546633207862819,
+      "grad_norm": 1.1684885025024414,
+      "learning_rate": 0.0002,
+      "loss": 0.6718,
+      "step": 4240
+    },
+    {
+      "epoch": 3.5549979088247596,
+      "grad_norm": 1.3189904689788818,
+      "learning_rate": 0.0002,
+      "loss": 0.6929,
+      "step": 4250
+    },
+    {
+      "epoch": 3.5633626097867,
+      "grad_norm": 1.3670802116394043,
+      "learning_rate": 0.0002,
+      "loss": 0.6328,
+      "step": 4260
+    },
+    {
+      "epoch": 3.5717273107486407,
+      "grad_norm": 1.1644766330718994,
+      "learning_rate": 0.0002,
+      "loss": 0.6373,
+      "step": 4270
+    },
+    {
+      "epoch": 3.5800920117105814,
+      "grad_norm": 1.3769445419311523,
+      "learning_rate": 0.0002,
+      "loss": 0.6629,
+      "step": 4280
+    },
+    {
+      "epoch": 3.5884567126725218,
+      "grad_norm": 2.1133229732513428,
+      "learning_rate": 0.0002,
+      "loss": 0.5844,
+      "step": 4290
+    },
+    {
+      "epoch": 3.5968214136344625,
+      "grad_norm": 1.5001072883605957,
+      "learning_rate": 0.0002,
+      "loss": 0.6676,
+      "step": 4300
+    },
+    {
+      "epoch": 3.6051861145964033,
+      "grad_norm": 1.4176048040390015,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 4310
+    },
+    {
+      "epoch": 3.6135508155583436,
+      "grad_norm": 2.443549633026123,
+      "learning_rate": 0.0002,
+      "loss": 0.662,
+      "step": 4320
+    },
+    {
+      "epoch": 3.6219155165202843,
+      "grad_norm": 1.5239051580429077,
+      "learning_rate": 0.0002,
+      "loss": 0.6344,
+      "step": 4330
+    },
+    {
+      "epoch": 3.630280217482225,
+      "grad_norm": 1.3866405487060547,
+      "learning_rate": 0.0002,
+      "loss": 0.6353,
+      "step": 4340
+    },
+    {
+      "epoch": 3.6386449184441654,
+      "grad_norm": 1.419424057006836,
+      "learning_rate": 0.0002,
+      "loss": 0.7001,
+      "step": 4350
+    },
+    {
+      "epoch": 3.647009619406106,
+      "grad_norm": 1.4565949440002441,
+      "learning_rate": 0.0002,
+      "loss": 0.6315,
+      "step": 4360
+    },
+    {
+      "epoch": 3.655374320368047,
+      "grad_norm": 1.342238426208496,
+      "learning_rate": 0.0002,
+      "loss": 0.6613,
+      "step": 4370
+    },
+    {
+      "epoch": 3.663739021329987,
+      "grad_norm": 1.408616542816162,
+      "learning_rate": 0.0002,
+      "loss": 0.6545,
+      "step": 4380
+    },
+    {
+      "epoch": 3.672103722291928,
+      "grad_norm": 1.7074459791183472,
+      "learning_rate": 0.0002,
+      "loss": 0.6958,
+      "step": 4390
+    },
+    {
+      "epoch": 3.6804684232538687,
+      "grad_norm": 1.4587712287902832,
+      "learning_rate": 0.0002,
+      "loss": 0.6938,
+      "step": 4400
+    },
+    {
+      "epoch": 3.6888331242158094,
+      "grad_norm": 1.318400263786316,
+      "learning_rate": 0.0002,
+      "loss": 0.6464,
+      "step": 4410
+    },
+    {
+      "epoch": 3.6971978251777498,
+      "grad_norm": 1.684881567955017,
+      "learning_rate": 0.0002,
+      "loss": 0.6061,
+      "step": 4420
+    },
+    {
+      "epoch": 3.7055625261396905,
+      "grad_norm": 1.6252070665359497,
+      "learning_rate": 0.0002,
+      "loss": 0.6719,
+      "step": 4430
+    },
+    {
+      "epoch": 3.7139272271016313,
+      "grad_norm": 1.296367883682251,
+      "learning_rate": 0.0002,
+      "loss": 0.6392,
+      "step": 4440
+    },
+    {
+      "epoch": 3.7222919280635716,
+      "grad_norm": 1.4503923654556274,
+      "learning_rate": 0.0002,
+      "loss": 0.7163,
+      "step": 4450
+    },
+    {
+      "epoch": 3.7306566290255123,
+      "grad_norm": 1.6230672597885132,
+      "learning_rate": 0.0002,
+      "loss": 0.6443,
+      "step": 4460
+    },
+    {
+      "epoch": 3.739021329987453,
+      "grad_norm": 1.2959922552108765,
+      "learning_rate": 0.0002,
+      "loss": 0.6438,
+      "step": 4470
+    },
+    {
+      "epoch": 3.747386030949394,
+      "grad_norm": 1.3456019163131714,
+      "learning_rate": 0.0002,
+      "loss": 0.6646,
+      "step": 4480
+    },
+    {
+      "epoch": 3.755750731911334,
+      "grad_norm": 1.8895857334136963,
+      "learning_rate": 0.0002,
+      "loss": 0.7042,
+      "step": 4490
+    },
+    {
+      "epoch": 3.764115432873275,
+      "grad_norm": 1.4293931722640991,
+      "learning_rate": 0.0002,
+      "loss": 0.7218,
+      "step": 4500
+    },
+    {
+      "epoch": 3.7724801338352156,
+      "grad_norm": 1.174730658531189,
+      "learning_rate": 0.0002,
+      "loss": 0.6997,
+      "step": 4510
+    },
+    {
+      "epoch": 3.780844834797156,
+      "grad_norm": 1.4697777032852173,
+      "learning_rate": 0.0002,
+      "loss": 0.6745,
+      "step": 4520
+    },
+    {
+      "epoch": 3.7892095357590967,
+      "grad_norm": 1.2952854633331299,
+      "learning_rate": 0.0002,
+      "loss": 0.6432,
+      "step": 4530
+    },
+    {
+      "epoch": 3.7975742367210374,
+      "grad_norm": 1.3999767303466797,
+      "learning_rate": 0.0002,
+      "loss": 0.6325,
+      "step": 4540
+    },
+    {
+      "epoch": 3.8059389376829778,
+      "grad_norm": 1.2619322538375854,
+      "learning_rate": 0.0002,
+      "loss": 0.6584,
+      "step": 4550
+    },
+    {
+      "epoch": 3.8143036386449185,
+      "grad_norm": 1.6904349327087402,
+      "learning_rate": 0.0002,
+      "loss": 0.6187,
+      "step": 4560
+    },
+    {
+      "epoch": 3.8226683396068593,
+      "grad_norm": 2.0094497203826904,
+      "learning_rate": 0.0002,
+      "loss": 0.7026,
+      "step": 4570
+    },
+    {
+      "epoch": 3.8310330405687996,
+      "grad_norm": 1.808794617652893,
+      "learning_rate": 0.0002,
+      "loss": 0.6606,
+      "step": 4580
+    },
+    {
+      "epoch": 3.8393977415307403,
+      "grad_norm": 1.2285358905792236,
+      "learning_rate": 0.0002,
+      "loss": 0.6602,
+      "step": 4590
+    },
+    {
+      "epoch": 3.847762442492681,
+      "grad_norm": 1.6204124689102173,
+      "learning_rate": 0.0002,
+      "loss": 0.6935,
+      "step": 4600
+    },
+    {
+      "epoch": 3.8561271434546214,
+      "grad_norm": 1.5913485288619995,
+      "learning_rate": 0.0002,
+      "loss": 0.6851,
+      "step": 4610
+    },
+    {
+      "epoch": 3.864491844416562,
+      "grad_norm": 1.1823159456253052,
+      "learning_rate": 0.0002,
+      "loss": 0.6813,
+      "step": 4620
+    },
+    {
+      "epoch": 3.872856545378503,
+      "grad_norm": 1.567445993423462,
+      "learning_rate": 0.0002,
+      "loss": 0.6593,
+      "step": 4630
+    },
+    {
+      "epoch": 3.881221246340443,
+      "grad_norm": 1.2258212566375732,
+      "learning_rate": 0.0002,
+      "loss": 0.6726,
+      "step": 4640
+    },
+    {
+      "epoch": 3.889585947302384,
+      "grad_norm": 1.3130079507827759,
+      "learning_rate": 0.0002,
+      "loss": 0.6591,
+      "step": 4650
+    },
+    {
+      "epoch": 3.8979506482643247,
+      "grad_norm": 1.1151717901229858,
+      "learning_rate": 0.0002,
+      "loss": 0.6029,
+      "step": 4660
+    },
+    {
+      "epoch": 3.906315349226265,
+      "grad_norm": 1.3589006662368774,
+      "learning_rate": 0.0002,
+      "loss": 0.6707,
+      "step": 4670
+    },
+    {
+      "epoch": 3.9146800501882058,
+      "grad_norm": 1.4174816608428955,
+      "learning_rate": 0.0002,
+      "loss": 0.6128,
+      "step": 4680
+    },
+    {
+      "epoch": 3.9230447511501465,
+      "grad_norm": 1.5213950872421265,
+      "learning_rate": 0.0002,
+      "loss": 0.6464,
+      "step": 4690
+    },
+    {
+      "epoch": 3.931409452112087,
+      "grad_norm": 1.7155694961547852,
+      "learning_rate": 0.0002,
+      "loss": 0.6668,
+      "step": 4700
+    },
+    {
+      "epoch": 3.9397741530740276,
+      "grad_norm": 1.1058056354522705,
+      "learning_rate": 0.0002,
+      "loss": 0.6682,
+      "step": 4710
+    },
+    {
+      "epoch": 3.9481388540359683,
+      "grad_norm": 1.8605217933654785,
+      "learning_rate": 0.0002,
+      "loss": 0.6224,
+      "step": 4720
+    },
+    {
+      "epoch": 3.9565035549979086,
+      "grad_norm": 1.394025444984436,
+      "learning_rate": 0.0002,
+      "loss": 0.6768,
+      "step": 4730
+    },
+    {
+      "epoch": 3.9648682559598494,
+      "grad_norm": 1.1949763298034668,
+      "learning_rate": 0.0002,
+      "loss": 0.7761,
+      "step": 4740
+    },
+    {
+      "epoch": 3.97323295692179,
+      "grad_norm": 1.2274279594421387,
+      "learning_rate": 0.0002,
+      "loss": 0.6036,
+      "step": 4750
+    },
+    {
+      "epoch": 3.9815976578837304,
+      "grad_norm": 1.1581952571868896,
+      "learning_rate": 0.0002,
+      "loss": 0.6765,
+      "step": 4760
+    },
+    {
+      "epoch": 3.989962358845671,
+      "grad_norm": 1.6140263080596924,
+      "learning_rate": 0.0002,
+      "loss": 0.6943,
+      "step": 4770
+    },
+    {
+      "epoch": 3.998327059807612,
+      "grad_norm": 1.6482408046722412,
+      "learning_rate": 0.0002,
+      "loss": 0.7059,
+      "step": 4780
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.2674074172973633,
+      "eval_runtime": 33.3838,
+      "eval_samples_per_second": 13.659,
+      "eval_steps_per_second": 1.707,
+      "step": 4782
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9560,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.098089431138304e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}