MilaWang committed on Mar 28, 2025

Commit

b07093d

verified ·

1 Parent(s): 28c364f

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/trainer_state.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-11217/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/trainer_state.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-12816/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/trainer_state.json +1161 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205/trainer_state.json +2289 -0

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:287190a522caadedb08a02f1ccd3a4293855b5bb21a6e1406b24d5fc9c22b743
+size 109069176

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a16b879467aa4962daf6df56bfc297dbb403b40143508ea6bd63ba3566b6f87
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3732994643877ec72c1dd774c4f16ecb3c64d351016067becc3a10a98645dea
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e44b3efcf5ee26d89edb13f6ecee68002161b344fb43171eade3d728c556b799
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:334d83ac6549f5ce73b8fd101d0d08fb744e8627602e1e3ddcb3f0fd9a8f7718
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e76e5f8e457239ae8e9a3a140db914985c1c9359562528b6e52d41e0a60d0a11
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7200deb7676ea8557f52d3642d310dae90ef58392e10311af3e277a0e0869cca
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a76de82a05fd06892df01a7d3da2377a11b115927e78a57e9c9e3c0d8fce958
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91c9b7432d33d9e1883b622298c87293cb0ac07e0e67967c7f0a35d1cb73e3be
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f71a4070923ff2ad241fbbd1334812dad610a022ad9359c9f3decf8aa0da065
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e76e5f8e457239ae8e9a3a140db914985c1c9359562528b6e52d41e0a60d0a11
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79d47a834a614c262bebc3ceaf4fe366865fe4e6ceb41907c10affc6bc301ff0
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b80bbf86af9cb96a677066a6c6dc3a496ac358a0c15ffaff57b74dd9736d933
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d40b8bfafa7b5cee4d92ea8270ad586fb923261c24dd1670eb91d6e13fdff06f
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4617a1fe68c22094d4569922304633a3bb0045830e767cfcd7abd54cb54451fa
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1161 @@

+{
+  "best_metric": 1.1827317476272583,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-1602",
+  "epoch": 0.9996879875195008,
+  "eval_steps": 10,
+  "global_step": 1602,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0062402496099844,
+      "grad_norm": 1.031922698020935,
+      "learning_rate": 0.0002,
+      "loss": 1.9624,
+      "step": 10
+    },
+    {
+      "epoch": 0.0124804992199688,
+      "grad_norm": 0.7286948561668396,
+      "learning_rate": 0.0002,
+      "loss": 1.5889,
+      "step": 20
+    },
+    {
+      "epoch": 0.0187207488299532,
+      "grad_norm": 0.7649297714233398,
+      "learning_rate": 0.0002,
+      "loss": 1.5445,
+      "step": 30
+    },
+    {
+      "epoch": 0.0249609984399376,
+      "grad_norm": 0.9243895411491394,
+      "learning_rate": 0.0002,
+      "loss": 1.6042,
+      "step": 40
+    },
+    {
+      "epoch": 0.031201248049921998,
+      "grad_norm": 1.2203439474105835,
+      "learning_rate": 0.0002,
+      "loss": 1.4839,
+      "step": 50
+    },
+    {
+      "epoch": 0.0374414976599064,
+      "grad_norm": 0.7954166531562805,
+      "learning_rate": 0.0002,
+      "loss": 1.2242,
+      "step": 60
+    },
+    {
+      "epoch": 0.0436817472698908,
+      "grad_norm": 1.0063410997390747,
+      "learning_rate": 0.0002,
+      "loss": 1.2635,
+      "step": 70
+    },
+    {
+      "epoch": 0.0499219968798752,
+      "grad_norm": 0.5335080027580261,
+      "learning_rate": 0.0002,
+      "loss": 1.2144,
+      "step": 80
+    },
+    {
+      "epoch": 0.056162246489859596,
+      "grad_norm": 0.5459050536155701,
+      "learning_rate": 0.0002,
+      "loss": 1.1897,
+      "step": 90
+    },
+    {
+      "epoch": 0.062402496099843996,
+      "grad_norm": 1.023564100265503,
+      "learning_rate": 0.0002,
+      "loss": 1.2977,
+      "step": 100
+    },
+    {
+      "epoch": 0.0686427457098284,
+      "grad_norm": 0.6166694760322571,
+      "learning_rate": 0.0002,
+      "loss": 1.1615,
+      "step": 110
+    },
+    {
+      "epoch": 0.0748829953198128,
+      "grad_norm": 0.6996013522148132,
+      "learning_rate": 0.0002,
+      "loss": 1.1544,
+      "step": 120
+    },
+    {
+      "epoch": 0.0811232449297972,
+      "grad_norm": 0.652845561504364,
+      "learning_rate": 0.0002,
+      "loss": 1.1415,
+      "step": 130
+    },
+    {
+      "epoch": 0.0873634945397816,
+      "grad_norm": 0.6401379108428955,
+      "learning_rate": 0.0002,
+      "loss": 1.213,
+      "step": 140
+    },
+    {
+      "epoch": 0.093603744149766,
+      "grad_norm": 0.8087514638900757,
+      "learning_rate": 0.0002,
+      "loss": 1.3089,
+      "step": 150
+    },
+    {
+      "epoch": 0.0998439937597504,
+      "grad_norm": 0.5709249973297119,
+      "learning_rate": 0.0002,
+      "loss": 1.1812,
+      "step": 160
+    },
+    {
+      "epoch": 0.1060842433697348,
+      "grad_norm": 0.6269909143447876,
+      "learning_rate": 0.0002,
+      "loss": 1.3098,
+      "step": 170
+    },
+    {
+      "epoch": 0.11232449297971919,
+      "grad_norm": 0.8364551663398743,
+      "learning_rate": 0.0002,
+      "loss": 1.3246,
+      "step": 180
+    },
+    {
+      "epoch": 0.11856474258970359,
+      "grad_norm": 0.8503940105438232,
+      "learning_rate": 0.0002,
+      "loss": 1.2288,
+      "step": 190
+    },
+    {
+      "epoch": 0.12480499219968799,
+      "grad_norm": 0.580119788646698,
+      "learning_rate": 0.0002,
+      "loss": 1.3022,
+      "step": 200
+    },
+    {
+      "epoch": 0.1310452418096724,
+      "grad_norm": 0.6041850447654724,
+      "learning_rate": 0.0002,
+      "loss": 1.1654,
+      "step": 210
+    },
+    {
+      "epoch": 0.1372854914196568,
+      "grad_norm": 1.061240553855896,
+      "learning_rate": 0.0002,
+      "loss": 1.3155,
+      "step": 220
+    },
+    {
+      "epoch": 0.1435257410296412,
+      "grad_norm": 0.7506526112556458,
+      "learning_rate": 0.0002,
+      "loss": 1.2319,
+      "step": 230
+    },
+    {
+      "epoch": 0.1497659906396256,
+      "grad_norm": 0.6461114883422852,
+      "learning_rate": 0.0002,
+      "loss": 1.1978,
+      "step": 240
+    },
+    {
+      "epoch": 0.15600624024961,
+      "grad_norm": 0.5992991924285889,
+      "learning_rate": 0.0002,
+      "loss": 1.2213,
+      "step": 250
+    },
+    {
+      "epoch": 0.1622464898595944,
+      "grad_norm": 0.6409032940864563,
+      "learning_rate": 0.0002,
+      "loss": 1.1884,
+      "step": 260
+    },
+    {
+      "epoch": 0.1684867394695788,
+      "grad_norm": 1.4767425060272217,
+      "learning_rate": 0.0002,
+      "loss": 1.3147,
+      "step": 270
+    },
+    {
+      "epoch": 0.1747269890795632,
+      "grad_norm": 0.8716024160385132,
+      "learning_rate": 0.0002,
+      "loss": 1.2098,
+      "step": 280
+    },
+    {
+      "epoch": 0.1809672386895476,
+      "grad_norm": 0.6197203993797302,
+      "learning_rate": 0.0002,
+      "loss": 1.1147,
+      "step": 290
+    },
+    {
+      "epoch": 0.187207488299532,
+      "grad_norm": 0.6289495825767517,
+      "learning_rate": 0.0002,
+      "loss": 1.2319,
+      "step": 300
+    },
+    {
+      "epoch": 0.1934477379095164,
+      "grad_norm": 0.660133421421051,
+      "learning_rate": 0.0002,
+      "loss": 1.1972,
+      "step": 310
+    },
+    {
+      "epoch": 0.1996879875195008,
+      "grad_norm": 0.6656067371368408,
+      "learning_rate": 0.0002,
+      "loss": 1.1408,
+      "step": 320
+    },
+    {
+      "epoch": 0.2059282371294852,
+      "grad_norm": 1.1313635110855103,
+      "learning_rate": 0.0002,
+      "loss": 1.196,
+      "step": 330
+    },
+    {
+      "epoch": 0.2121684867394696,
+      "grad_norm": 0.5510236620903015,
+      "learning_rate": 0.0002,
+      "loss": 1.2038,
+      "step": 340
+    },
+    {
+      "epoch": 0.21840873634945399,
+      "grad_norm": 0.6822632551193237,
+      "learning_rate": 0.0002,
+      "loss": 1.1399,
+      "step": 350
+    },
+    {
+      "epoch": 0.22464898595943839,
+      "grad_norm": 0.9127046465873718,
+      "learning_rate": 0.0002,
+      "loss": 1.1951,
+      "step": 360
+    },
+    {
+      "epoch": 0.23088923556942278,
+      "grad_norm": 0.6233941912651062,
+      "learning_rate": 0.0002,
+      "loss": 1.1956,
+      "step": 370
+    },
+    {
+      "epoch": 0.23712948517940718,
+      "grad_norm": 0.7165210843086243,
+      "learning_rate": 0.0002,
+      "loss": 1.2959,
+      "step": 380
+    },
+    {
+      "epoch": 0.24336973478939158,
+      "grad_norm": 0.668282687664032,
+      "learning_rate": 0.0002,
+      "loss": 1.2659,
+      "step": 390
+    },
+    {
+      "epoch": 0.24960998439937598,
+      "grad_norm": 0.6597710847854614,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 400
+    },
+    {
+      "epoch": 0.25585023400936036,
+      "grad_norm": 0.516697347164154,
+      "learning_rate": 0.0002,
+      "loss": 1.2153,
+      "step": 410
+    },
+    {
+      "epoch": 0.2620904836193448,
+      "grad_norm": 0.6005304455757141,
+      "learning_rate": 0.0002,
+      "loss": 1.2043,
+      "step": 420
+    },
+    {
+      "epoch": 0.26833073322932915,
+      "grad_norm": 0.6785455346107483,
+      "learning_rate": 0.0002,
+      "loss": 1.1738,
+      "step": 430
+    },
+    {
+      "epoch": 0.2745709828393136,
+      "grad_norm": 0.5856916904449463,
+      "learning_rate": 0.0002,
+      "loss": 1.2708,
+      "step": 440
+    },
+    {
+      "epoch": 0.28081123244929795,
+      "grad_norm": 0.5484935641288757,
+      "learning_rate": 0.0002,
+      "loss": 1.2617,
+      "step": 450
+    },
+    {
+      "epoch": 0.2870514820592824,
+      "grad_norm": 0.566983699798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 460
+    },
+    {
+      "epoch": 0.29329173166926675,
+      "grad_norm": 0.5833986401557922,
+      "learning_rate": 0.0002,
+      "loss": 1.2278,
+      "step": 470
+    },
+    {
+      "epoch": 0.2995319812792512,
+      "grad_norm": 0.5306270718574524,
+      "learning_rate": 0.0002,
+      "loss": 1.2581,
+      "step": 480
+    },
+    {
+      "epoch": 0.30577223088923555,
+      "grad_norm": 0.6077427864074707,
+      "learning_rate": 0.0002,
+      "loss": 1.1978,
+      "step": 490
+    },
+    {
+      "epoch": 0.31201248049922,
+      "grad_norm": 0.629591166973114,
+      "learning_rate": 0.0002,
+      "loss": 1.2985,
+      "step": 500
+    },
+    {
+      "epoch": 0.31825273010920435,
+      "grad_norm": 0.7811625599861145,
+      "learning_rate": 0.0002,
+      "loss": 1.1428,
+      "step": 510
+    },
+    {
+      "epoch": 0.3244929797191888,
+      "grad_norm": 0.7362602353096008,
+      "learning_rate": 0.0002,
+      "loss": 1.1695,
+      "step": 520
+    },
+    {
+      "epoch": 0.33073322932917315,
+      "grad_norm": 0.551548957824707,
+      "learning_rate": 0.0002,
+      "loss": 1.185,
+      "step": 530
+    },
+    {
+      "epoch": 0.3369734789391576,
+      "grad_norm": 0.7770114541053772,
+      "learning_rate": 0.0002,
+      "loss": 1.1837,
+      "step": 540
+    },
+    {
+      "epoch": 0.34321372854914195,
+      "grad_norm": 0.7805799841880798,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 550
+    },
+    {
+      "epoch": 0.3494539781591264,
+      "grad_norm": 0.5508017539978027,
+      "learning_rate": 0.0002,
+      "loss": 1.2126,
+      "step": 560
+    },
+    {
+      "epoch": 0.35569422776911075,
+      "grad_norm": 0.8626343607902527,
+      "learning_rate": 0.0002,
+      "loss": 1.1915,
+      "step": 570
+    },
+    {
+      "epoch": 0.3619344773790952,
+      "grad_norm": 0.6867557168006897,
+      "learning_rate": 0.0002,
+      "loss": 1.3052,
+      "step": 580
+    },
+    {
+      "epoch": 0.36817472698907955,
+      "grad_norm": 1.0430885553359985,
+      "learning_rate": 0.0002,
+      "loss": 1.1494,
+      "step": 590
+    },
+    {
+      "epoch": 0.374414976599064,
+      "grad_norm": 0.7436304092407227,
+      "learning_rate": 0.0002,
+      "loss": 1.2434,
+      "step": 600
+    },
+    {
+      "epoch": 0.38065522620904835,
+      "grad_norm": 0.5486089587211609,
+      "learning_rate": 0.0002,
+      "loss": 1.1647,
+      "step": 610
+    },
+    {
+      "epoch": 0.3868954758190328,
+      "grad_norm": 0.590490996837616,
+      "learning_rate": 0.0002,
+      "loss": 1.2313,
+      "step": 620
+    },
+    {
+      "epoch": 0.39313572542901715,
+      "grad_norm": 0.6123483180999756,
+      "learning_rate": 0.0002,
+      "loss": 1.2017,
+      "step": 630
+    },
+    {
+      "epoch": 0.3993759750390016,
+      "grad_norm": 0.6024467349052429,
+      "learning_rate": 0.0002,
+      "loss": 1.2625,
+      "step": 640
+    },
+    {
+      "epoch": 0.40561622464898595,
+      "grad_norm": 0.7221348881721497,
+      "learning_rate": 0.0002,
+      "loss": 1.1165,
+      "step": 650
+    },
+    {
+      "epoch": 0.4118564742589704,
+      "grad_norm": 1.225263237953186,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 660
+    },
+    {
+      "epoch": 0.41809672386895474,
+      "grad_norm": 0.69964599609375,
+      "learning_rate": 0.0002,
+      "loss": 1.2144,
+      "step": 670
+    },
+    {
+      "epoch": 0.4243369734789392,
+      "grad_norm": 0.8724095821380615,
+      "learning_rate": 0.0002,
+      "loss": 1.2997,
+      "step": 680
+    },
+    {
+      "epoch": 0.43057722308892354,
+      "grad_norm": 0.9811216592788696,
+      "learning_rate": 0.0002,
+      "loss": 1.2633,
+      "step": 690
+    },
+    {
+      "epoch": 0.43681747269890797,
+      "grad_norm": 0.6494196057319641,
+      "learning_rate": 0.0002,
+      "loss": 1.2051,
+      "step": 700
+    },
+    {
+      "epoch": 0.44305772230889234,
+      "grad_norm": 0.683635950088501,
+      "learning_rate": 0.0002,
+      "loss": 1.1308,
+      "step": 710
+    },
+    {
+      "epoch": 0.44929797191887677,
+      "grad_norm": 0.7341657876968384,
+      "learning_rate": 0.0002,
+      "loss": 1.2318,
+      "step": 720
+    },
+    {
+      "epoch": 0.45553822152886114,
+      "grad_norm": 0.5400960445404053,
+      "learning_rate": 0.0002,
+      "loss": 1.1967,
+      "step": 730
+    },
+    {
+      "epoch": 0.46177847113884557,
+      "grad_norm": 0.7045732736587524,
+      "learning_rate": 0.0002,
+      "loss": 1.2383,
+      "step": 740
+    },
+    {
+      "epoch": 0.46801872074882994,
+      "grad_norm": 0.6595138907432556,
+      "learning_rate": 0.0002,
+      "loss": 1.1054,
+      "step": 750
+    },
+    {
+      "epoch": 0.47425897035881437,
+      "grad_norm": 0.7014768719673157,
+      "learning_rate": 0.0002,
+      "loss": 1.1268,
+      "step": 760
+    },
+    {
+      "epoch": 0.48049921996879874,
+      "grad_norm": 0.7153908014297485,
+      "learning_rate": 0.0002,
+      "loss": 1.2706,
+      "step": 770
+    },
+    {
+      "epoch": 0.48673946957878317,
+      "grad_norm": 0.5434338450431824,
+      "learning_rate": 0.0002,
+      "loss": 1.1744,
+      "step": 780
+    },
+    {
+      "epoch": 0.49297971918876754,
+      "grad_norm": 0.626314103603363,
+      "learning_rate": 0.0002,
+      "loss": 1.0866,
+      "step": 790
+    },
+    {
+      "epoch": 0.49921996879875197,
+      "grad_norm": 0.6473543643951416,
+      "learning_rate": 0.0002,
+      "loss": 1.2015,
+      "step": 800
+    },
+    {
+      "epoch": 0.5054602184087363,
+      "grad_norm": 0.6651485562324524,
+      "learning_rate": 0.0002,
+      "loss": 1.2547,
+      "step": 810
+    },
+    {
+      "epoch": 0.5117004680187207,
+      "grad_norm": 0.618462085723877,
+      "learning_rate": 0.0002,
+      "loss": 1.2447,
+      "step": 820
+    },
+    {
+      "epoch": 0.5179407176287052,
+      "grad_norm": 0.7226157784461975,
+      "learning_rate": 0.0002,
+      "loss": 1.2192,
+      "step": 830
+    },
+    {
+      "epoch": 0.5241809672386896,
+      "grad_norm": 1.5444509983062744,
+      "learning_rate": 0.0002,
+      "loss": 1.1925,
+      "step": 840
+    },
+    {
+      "epoch": 0.5304212168486739,
+      "grad_norm": 0.6177148818969727,
+      "learning_rate": 0.0002,
+      "loss": 1.1245,
+      "step": 850
+    },
+    {
+      "epoch": 0.5366614664586583,
+      "grad_norm": 0.711006224155426,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 860
+    },
+    {
+      "epoch": 0.5429017160686428,
+      "grad_norm": 0.7458148002624512,
+      "learning_rate": 0.0002,
+      "loss": 1.2325,
+      "step": 870
+    },
+    {
+      "epoch": 0.5491419656786272,
+      "grad_norm": 0.5732790231704712,
+      "learning_rate": 0.0002,
+      "loss": 1.1721,
+      "step": 880
+    },
+    {
+      "epoch": 0.5553822152886115,
+      "grad_norm": 0.6373953819274902,
+      "learning_rate": 0.0002,
+      "loss": 1.2311,
+      "step": 890
+    },
+    {
+      "epoch": 0.5616224648985959,
+      "grad_norm": 1.0794939994812012,
+      "learning_rate": 0.0002,
+      "loss": 1.1618,
+      "step": 900
+    },
+    {
+      "epoch": 0.5678627145085804,
+      "grad_norm": 0.6735630035400391,
+      "learning_rate": 0.0002,
+      "loss": 1.1471,
+      "step": 910
+    },
+    {
+      "epoch": 0.5741029641185648,
+      "grad_norm": 0.6513162851333618,
+      "learning_rate": 0.0002,
+      "loss": 1.2506,
+      "step": 920
+    },
+    {
+      "epoch": 0.5803432137285491,
+      "grad_norm": 1.188833236694336,
+      "learning_rate": 0.0002,
+      "loss": 1.1328,
+      "step": 930
+    },
+    {
+      "epoch": 0.5865834633385335,
+      "grad_norm": 0.6634365320205688,
+      "learning_rate": 0.0002,
+      "loss": 1.2434,
+      "step": 940
+    },
+    {
+      "epoch": 0.592823712948518,
+      "grad_norm": 2.542186975479126,
+      "learning_rate": 0.0002,
+      "loss": 1.1143,
+      "step": 950
+    },
+    {
+      "epoch": 0.5990639625585024,
+      "grad_norm": 0.9277311563491821,
+      "learning_rate": 0.0002,
+      "loss": 1.1844,
+      "step": 960
+    },
+    {
+      "epoch": 0.6053042121684867,
+      "grad_norm": 0.7193790674209595,
+      "learning_rate": 0.0002,
+      "loss": 1.1732,
+      "step": 970
+    },
+    {
+      "epoch": 0.6115444617784711,
+      "grad_norm": 0.5681775808334351,
+      "learning_rate": 0.0002,
+      "loss": 1.2559,
+      "step": 980
+    },
+    {
+      "epoch": 0.6177847113884556,
+      "grad_norm": 0.6696691513061523,
+      "learning_rate": 0.0002,
+      "loss": 1.1388,
+      "step": 990
+    },
+    {
+      "epoch": 0.62402496099844,
+      "grad_norm": 0.721344530582428,
+      "learning_rate": 0.0002,
+      "loss": 1.1189,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6302652106084243,
+      "grad_norm": 0.8425031304359436,
+      "learning_rate": 0.0002,
+      "loss": 1.1777,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6365054602184087,
+      "grad_norm": 0.7791627645492554,
+      "learning_rate": 0.0002,
+      "loss": 1.2199,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6427457098283932,
+      "grad_norm": 0.5969202518463135,
+      "learning_rate": 0.0002,
+      "loss": 1.1708,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6489859594383776,
+      "grad_norm": 0.8637568950653076,
+      "learning_rate": 0.0002,
+      "loss": 1.272,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6552262090483619,
+      "grad_norm": 0.6667028069496155,
+      "learning_rate": 0.0002,
+      "loss": 1.0522,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6614664586583463,
+      "grad_norm": 0.8002216219902039,
+      "learning_rate": 0.0002,
+      "loss": 1.2185,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6677067082683308,
+      "grad_norm": 0.6133790612220764,
+      "learning_rate": 0.0002,
+      "loss": 1.2724,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6739469578783152,
+      "grad_norm": 0.8466842770576477,
+      "learning_rate": 0.0002,
+      "loss": 1.2591,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6801872074882995,
+      "grad_norm": 0.534599244594574,
+      "learning_rate": 0.0002,
+      "loss": 1.1045,
+      "step": 1090
+    },
+    {
+      "epoch": 0.6864274570982839,
+      "grad_norm": 0.5979007482528687,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 1100
+    },
+    {
+      "epoch": 0.6926677067082684,
+      "grad_norm": 0.6573676466941833,
+      "learning_rate": 0.0002,
+      "loss": 1.0852,
+      "step": 1110
+    },
+    {
+      "epoch": 0.6989079563182528,
+      "grad_norm": 2.110194683074951,
+      "learning_rate": 0.0002,
+      "loss": 1.1667,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7051482059282371,
+      "grad_norm": 0.6480202674865723,
+      "learning_rate": 0.0002,
+      "loss": 1.1643,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7113884555382215,
+      "grad_norm": 0.9007996320724487,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 1140
+    },
+    {
+      "epoch": 0.717628705148206,
+      "grad_norm": 0.864778995513916,
+      "learning_rate": 0.0002,
+      "loss": 1.1982,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7238689547581904,
+      "grad_norm": 0.6132529377937317,
+      "learning_rate": 0.0002,
+      "loss": 1.0931,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7301092043681747,
+      "grad_norm": 0.6523057222366333,
+      "learning_rate": 0.0002,
+      "loss": 1.0875,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7363494539781591,
+      "grad_norm": 0.6278107166290283,
+      "learning_rate": 0.0002,
+      "loss": 1.183,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7425897035881436,
+      "grad_norm": 0.9367479085922241,
+      "learning_rate": 0.0002,
+      "loss": 1.2556,
+      "step": 1190
+    },
+    {
+      "epoch": 0.748829953198128,
+      "grad_norm": 0.6311790943145752,
+      "learning_rate": 0.0002,
+      "loss": 1.2329,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7550702028081123,
+      "grad_norm": 0.594434916973114,
+      "learning_rate": 0.0002,
+      "loss": 1.1126,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7613104524180967,
+      "grad_norm": 0.7832707166671753,
+      "learning_rate": 0.0002,
+      "loss": 1.126,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7675507020280812,
+      "grad_norm": 0.6111505627632141,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 1230
+    },
+    {
+      "epoch": 0.7737909516380655,
+      "grad_norm": 0.6673868894577026,
+      "learning_rate": 0.0002,
+      "loss": 1.1395,
+      "step": 1240
+    },
+    {
+      "epoch": 0.7800312012480499,
+      "grad_norm": 0.740943431854248,
+      "learning_rate": 0.0002,
+      "loss": 1.1455,
+      "step": 1250
+    },
+    {
+      "epoch": 0.7862714508580343,
+      "grad_norm": 0.6874880790710449,
+      "learning_rate": 0.0002,
+      "loss": 1.2677,
+      "step": 1260
+    },
+    {
+      "epoch": 0.7925117004680188,
+      "grad_norm": 0.6566919684410095,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 1270
+    },
+    {
+      "epoch": 0.7987519500780031,
+      "grad_norm": 0.5894289612770081,
+      "learning_rate": 0.0002,
+      "loss": 1.1479,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8049921996879875,
+      "grad_norm": 0.8081681132316589,
+      "learning_rate": 0.0002,
+      "loss": 1.2025,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8112324492979719,
+      "grad_norm": 0.6353256106376648,
+      "learning_rate": 0.0002,
+      "loss": 1.1218,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8174726989079563,
+      "grad_norm": 0.5706279277801514,
+      "learning_rate": 0.0002,
+      "loss": 1.2244,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8237129485179407,
+      "grad_norm": 0.5903506278991699,
+      "learning_rate": 0.0002,
+      "loss": 1.1468,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8299531981279251,
+      "grad_norm": 0.5297395586967468,
+      "learning_rate": 0.0002,
+      "loss": 1.1757,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8361934477379095,
+      "grad_norm": 0.6525882482528687,
+      "learning_rate": 0.0002,
+      "loss": 1.0841,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8424336973478939,
+      "grad_norm": 0.8756698966026306,
+      "learning_rate": 0.0002,
+      "loss": 1.194,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8486739469578783,
+      "grad_norm": 1.0452430248260498,
+      "learning_rate": 0.0002,
+      "loss": 1.2273,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8549141965678627,
+      "grad_norm": 0.6416711807250977,
+      "learning_rate": 0.0002,
+      "loss": 1.0973,
+      "step": 1370
+    },
+    {
+      "epoch": 0.8611544461778471,
+      "grad_norm": 0.49327632784843445,
+      "learning_rate": 0.0002,
+      "loss": 1.2116,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8673946957878315,
+      "grad_norm": 0.6748191714286804,
+      "learning_rate": 0.0002,
+      "loss": 1.1374,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8736349453978159,
+      "grad_norm": 1.153362512588501,
+      "learning_rate": 0.0002,
+      "loss": 1.2677,
+      "step": 1400
+    },
+    {
+      "epoch": 0.8798751950078003,
+      "grad_norm": 0.8151076436042786,
+      "learning_rate": 0.0002,
+      "loss": 1.1562,
+      "step": 1410
+    },
+    {
+      "epoch": 0.8861154446177847,
+      "grad_norm": 1.00041925907135,
+      "learning_rate": 0.0002,
+      "loss": 1.1389,
+      "step": 1420
+    },
+    {
+      "epoch": 0.8923556942277691,
+      "grad_norm": 0.7730890512466431,
+      "learning_rate": 0.0002,
+      "loss": 1.2937,
+      "step": 1430
+    },
+    {
+      "epoch": 0.8985959438377535,
+      "grad_norm": 0.5944583415985107,
+      "learning_rate": 0.0002,
+      "loss": 1.1201,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9048361934477379,
+      "grad_norm": 1.2791529893875122,
+      "learning_rate": 0.0002,
+      "loss": 1.055,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9110764430577223,
+      "grad_norm": 0.6362823843955994,
+      "learning_rate": 0.0002,
+      "loss": 1.1659,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9173166926677067,
+      "grad_norm": 0.5871922969818115,
+      "learning_rate": 0.0002,
+      "loss": 1.1327,
+      "step": 1470
+    },
+    {
+      "epoch": 0.9235569422776911,
+      "grad_norm": 0.8008357286453247,
+      "learning_rate": 0.0002,
+      "loss": 1.1736,
+      "step": 1480
+    },
+    {
+      "epoch": 0.9297971918876755,
+      "grad_norm": 0.89249587059021,
+      "learning_rate": 0.0002,
+      "loss": 1.1132,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9360374414976599,
+      "grad_norm": 0.7067795395851135,
+      "learning_rate": 0.0002,
+      "loss": 1.1981,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9422776911076443,
+      "grad_norm": 0.8353140354156494,
+      "learning_rate": 0.0002,
+      "loss": 1.2021,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9485179407176287,
+      "grad_norm": 0.7982086539268494,
+      "learning_rate": 0.0002,
+      "loss": 1.1552,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9547581903276131,
+      "grad_norm": 0.6895356178283691,
+      "learning_rate": 0.0002,
+      "loss": 1.1823,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9609984399375975,
+      "grad_norm": 0.6823073029518127,
+      "learning_rate": 0.0002,
+      "loss": 1.1949,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9672386895475819,
+      "grad_norm": 0.6817622184753418,
+      "learning_rate": 0.0002,
+      "loss": 1.0287,
+      "step": 1550
+    },
+    {
+      "epoch": 0.9734789391575663,
+      "grad_norm": 2.095001459121704,
+      "learning_rate": 0.0002,
+      "loss": 1.131,
+      "step": 1560
+    },
+    {
+      "epoch": 0.9797191887675507,
+      "grad_norm": 1.0685795545578003,
+      "learning_rate": 0.0002,
+      "loss": 1.1759,
+      "step": 1570
+    },
+    {
+      "epoch": 0.9859594383775351,
+      "grad_norm": 0.6668464541435242,
+      "learning_rate": 0.0002,
+      "loss": 1.2076,
+      "step": 1580
+    },
+    {
+      "epoch": 0.9921996879875195,
+      "grad_norm": 0.6560982465744019,
+      "learning_rate": 0.0002,
+      "loss": 1.216,
+      "step": 1590
+    },
+    {
+      "epoch": 0.9984399375975039,
+      "grad_norm": 0.634287416934967,
+      "learning_rate": 0.0002,
+      "loss": 1.1209,
+      "step": 1600
+    },
+    {
+      "epoch": 0.9996879875195008,
+      "eval_loss": 1.1827317476272583,
+      "eval_runtime": 63.566,
+      "eval_samples_per_second": 7.174,
+      "eval_steps_per_second": 0.897,
+      "step": 1602
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 12816,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.0309249548288e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e76e5f8e457239ae8e9a3a140db914985c1c9359562528b6e52d41e0a60d0a11
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:287190a522caadedb08a02f1ccd3a4293855b5bb21a6e1406b24d5fc9c22b743
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b6742d868d3d9abd3adefc5a3889f9088e650c16c3819e10d9f31bf93e97a01
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02aac109060891b3dfe236cd1c50c5c252c77d603dfacd4b65429cb2127d7b0a
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be062595fadd5fc6ca5c2b09baeb3e22074106faa3c516009a542801dd75cd38
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2289 @@

+{
+  "best_metric": 1.1564315557479858,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-9809-sd-42/checkpoint-3205",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 3205,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0062402496099844,
+      "grad_norm": 1.031922698020935,
+      "learning_rate": 0.0002,
+      "loss": 1.9624,
+      "step": 10
+    },
+    {
+      "epoch": 0.0124804992199688,
+      "grad_norm": 0.7286948561668396,
+      "learning_rate": 0.0002,
+      "loss": 1.5889,
+      "step": 20
+    },
+    {
+      "epoch": 0.0187207488299532,
+      "grad_norm": 0.7649297714233398,
+      "learning_rate": 0.0002,
+      "loss": 1.5445,
+      "step": 30
+    },
+    {
+      "epoch": 0.0249609984399376,
+      "grad_norm": 0.9243895411491394,
+      "learning_rate": 0.0002,
+      "loss": 1.6042,
+      "step": 40
+    },
+    {
+      "epoch": 0.031201248049921998,
+      "grad_norm": 1.2203439474105835,
+      "learning_rate": 0.0002,
+      "loss": 1.4839,
+      "step": 50
+    },
+    {
+      "epoch": 0.0374414976599064,
+      "grad_norm": 0.7954166531562805,
+      "learning_rate": 0.0002,
+      "loss": 1.2242,
+      "step": 60
+    },
+    {
+      "epoch": 0.0436817472698908,
+      "grad_norm": 1.0063410997390747,
+      "learning_rate": 0.0002,
+      "loss": 1.2635,
+      "step": 70
+    },
+    {
+      "epoch": 0.0499219968798752,
+      "grad_norm": 0.5335080027580261,
+      "learning_rate": 0.0002,
+      "loss": 1.2144,
+      "step": 80
+    },
+    {
+      "epoch": 0.056162246489859596,
+      "grad_norm": 0.5459050536155701,
+      "learning_rate": 0.0002,
+      "loss": 1.1897,
+      "step": 90
+    },
+    {
+      "epoch": 0.062402496099843996,
+      "grad_norm": 1.023564100265503,
+      "learning_rate": 0.0002,
+      "loss": 1.2977,
+      "step": 100
+    },
+    {
+      "epoch": 0.0686427457098284,
+      "grad_norm": 0.6166694760322571,
+      "learning_rate": 0.0002,
+      "loss": 1.1615,
+      "step": 110
+    },
+    {
+      "epoch": 0.0748829953198128,
+      "grad_norm": 0.6996013522148132,
+      "learning_rate": 0.0002,
+      "loss": 1.1544,
+      "step": 120
+    },
+    {
+      "epoch": 0.0811232449297972,
+      "grad_norm": 0.652845561504364,
+      "learning_rate": 0.0002,
+      "loss": 1.1415,
+      "step": 130
+    },
+    {
+      "epoch": 0.0873634945397816,
+      "grad_norm": 0.6401379108428955,
+      "learning_rate": 0.0002,
+      "loss": 1.213,
+      "step": 140
+    },
+    {
+      "epoch": 0.093603744149766,
+      "grad_norm": 0.8087514638900757,
+      "learning_rate": 0.0002,
+      "loss": 1.3089,
+      "step": 150
+    },
+    {
+      "epoch": 0.0998439937597504,
+      "grad_norm": 0.5709249973297119,
+      "learning_rate": 0.0002,
+      "loss": 1.1812,
+      "step": 160
+    },
+    {
+      "epoch": 0.1060842433697348,
+      "grad_norm": 0.6269909143447876,
+      "learning_rate": 0.0002,
+      "loss": 1.3098,
+      "step": 170
+    },
+    {
+      "epoch": 0.11232449297971919,
+      "grad_norm": 0.8364551663398743,
+      "learning_rate": 0.0002,
+      "loss": 1.3246,
+      "step": 180
+    },
+    {
+      "epoch": 0.11856474258970359,
+      "grad_norm": 0.8503940105438232,
+      "learning_rate": 0.0002,
+      "loss": 1.2288,
+      "step": 190
+    },
+    {
+      "epoch": 0.12480499219968799,
+      "grad_norm": 0.580119788646698,
+      "learning_rate": 0.0002,
+      "loss": 1.3022,
+      "step": 200
+    },
+    {
+      "epoch": 0.1310452418096724,
+      "grad_norm": 0.6041850447654724,
+      "learning_rate": 0.0002,
+      "loss": 1.1654,
+      "step": 210
+    },
+    {
+      "epoch": 0.1372854914196568,
+      "grad_norm": 1.061240553855896,
+      "learning_rate": 0.0002,
+      "loss": 1.3155,
+      "step": 220
+    },
+    {
+      "epoch": 0.1435257410296412,
+      "grad_norm": 0.7506526112556458,
+      "learning_rate": 0.0002,
+      "loss": 1.2319,
+      "step": 230
+    },
+    {
+      "epoch": 0.1497659906396256,
+      "grad_norm": 0.6461114883422852,
+      "learning_rate": 0.0002,
+      "loss": 1.1978,
+      "step": 240
+    },
+    {
+      "epoch": 0.15600624024961,
+      "grad_norm": 0.5992991924285889,
+      "learning_rate": 0.0002,
+      "loss": 1.2213,
+      "step": 250
+    },
+    {
+      "epoch": 0.1622464898595944,
+      "grad_norm": 0.6409032940864563,
+      "learning_rate": 0.0002,
+      "loss": 1.1884,
+      "step": 260
+    },
+    {
+      "epoch": 0.1684867394695788,
+      "grad_norm": 1.4767425060272217,
+      "learning_rate": 0.0002,
+      "loss": 1.3147,
+      "step": 270
+    },
+    {
+      "epoch": 0.1747269890795632,
+      "grad_norm": 0.8716024160385132,
+      "learning_rate": 0.0002,
+      "loss": 1.2098,
+      "step": 280
+    },
+    {
+      "epoch": 0.1809672386895476,
+      "grad_norm": 0.6197203993797302,
+      "learning_rate": 0.0002,
+      "loss": 1.1147,
+      "step": 290
+    },
+    {
+      "epoch": 0.187207488299532,
+      "grad_norm": 0.6289495825767517,
+      "learning_rate": 0.0002,
+      "loss": 1.2319,
+      "step": 300
+    },
+    {
+      "epoch": 0.1934477379095164,
+      "grad_norm": 0.660133421421051,
+      "learning_rate": 0.0002,
+      "loss": 1.1972,
+      "step": 310
+    },
+    {
+      "epoch": 0.1996879875195008,
+      "grad_norm": 0.6656067371368408,
+      "learning_rate": 0.0002,
+      "loss": 1.1408,
+      "step": 320
+    },
+    {
+      "epoch": 0.2059282371294852,
+      "grad_norm": 1.1313635110855103,
+      "learning_rate": 0.0002,
+      "loss": 1.196,
+      "step": 330
+    },
+    {
+      "epoch": 0.2121684867394696,
+      "grad_norm": 0.5510236620903015,
+      "learning_rate": 0.0002,
+      "loss": 1.2038,
+      "step": 340
+    },
+    {
+      "epoch": 0.21840873634945399,
+      "grad_norm": 0.6822632551193237,
+      "learning_rate": 0.0002,
+      "loss": 1.1399,
+      "step": 350
+    },
+    {
+      "epoch": 0.22464898595943839,
+      "grad_norm": 0.9127046465873718,
+      "learning_rate": 0.0002,
+      "loss": 1.1951,
+      "step": 360
+    },
+    {
+      "epoch": 0.23088923556942278,
+      "grad_norm": 0.6233941912651062,
+      "learning_rate": 0.0002,
+      "loss": 1.1956,
+      "step": 370
+    },
+    {
+      "epoch": 0.23712948517940718,
+      "grad_norm": 0.7165210843086243,
+      "learning_rate": 0.0002,
+      "loss": 1.2959,
+      "step": 380
+    },
+    {
+      "epoch": 0.24336973478939158,
+      "grad_norm": 0.668282687664032,
+      "learning_rate": 0.0002,
+      "loss": 1.2659,
+      "step": 390
+    },
+    {
+      "epoch": 0.24960998439937598,
+      "grad_norm": 0.6597710847854614,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 400
+    },
+    {
+      "epoch": 0.25585023400936036,
+      "grad_norm": 0.516697347164154,
+      "learning_rate": 0.0002,
+      "loss": 1.2153,
+      "step": 410
+    },
+    {
+      "epoch": 0.2620904836193448,
+      "grad_norm": 0.6005304455757141,
+      "learning_rate": 0.0002,
+      "loss": 1.2043,
+      "step": 420
+    },
+    {
+      "epoch": 0.26833073322932915,
+      "grad_norm": 0.6785455346107483,
+      "learning_rate": 0.0002,
+      "loss": 1.1738,
+      "step": 430
+    },
+    {
+      "epoch": 0.2745709828393136,
+      "grad_norm": 0.5856916904449463,
+      "learning_rate": 0.0002,
+      "loss": 1.2708,
+      "step": 440
+    },
+    {
+      "epoch": 0.28081123244929795,
+      "grad_norm": 0.5484935641288757,
+      "learning_rate": 0.0002,
+      "loss": 1.2617,
+      "step": 450
+    },
+    {
+      "epoch": 0.2870514820592824,
+      "grad_norm": 0.566983699798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 460
+    },
+    {
+      "epoch": 0.29329173166926675,
+      "grad_norm": 0.5833986401557922,
+      "learning_rate": 0.0002,
+      "loss": 1.2278,
+      "step": 470
+    },
+    {
+      "epoch": 0.2995319812792512,
+      "grad_norm": 0.5306270718574524,
+      "learning_rate": 0.0002,
+      "loss": 1.2581,
+      "step": 480
+    },
+    {
+      "epoch": 0.30577223088923555,
+      "grad_norm": 0.6077427864074707,
+      "learning_rate": 0.0002,
+      "loss": 1.1978,
+      "step": 490
+    },
+    {
+      "epoch": 0.31201248049922,
+      "grad_norm": 0.629591166973114,
+      "learning_rate": 0.0002,
+      "loss": 1.2985,
+      "step": 500
+    },
+    {
+      "epoch": 0.31825273010920435,
+      "grad_norm": 0.7811625599861145,
+      "learning_rate": 0.0002,
+      "loss": 1.1428,
+      "step": 510
+    },
+    {
+      "epoch": 0.3244929797191888,
+      "grad_norm": 0.7362602353096008,
+      "learning_rate": 0.0002,
+      "loss": 1.1695,
+      "step": 520
+    },
+    {
+      "epoch": 0.33073322932917315,
+      "grad_norm": 0.551548957824707,
+      "learning_rate": 0.0002,
+      "loss": 1.185,
+      "step": 530
+    },
+    {
+      "epoch": 0.3369734789391576,
+      "grad_norm": 0.7770114541053772,
+      "learning_rate": 0.0002,
+      "loss": 1.1837,
+      "step": 540
+    },
+    {
+      "epoch": 0.34321372854914195,
+      "grad_norm": 0.7805799841880798,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 550
+    },
+    {
+      "epoch": 0.3494539781591264,
+      "grad_norm": 0.5508017539978027,
+      "learning_rate": 0.0002,
+      "loss": 1.2126,
+      "step": 560
+    },
+    {
+      "epoch": 0.35569422776911075,
+      "grad_norm": 0.8626343607902527,
+      "learning_rate": 0.0002,
+      "loss": 1.1915,
+      "step": 570
+    },
+    {
+      "epoch": 0.3619344773790952,
+      "grad_norm": 0.6867557168006897,
+      "learning_rate": 0.0002,
+      "loss": 1.3052,
+      "step": 580
+    },
+    {
+      "epoch": 0.36817472698907955,
+      "grad_norm": 1.0430885553359985,
+      "learning_rate": 0.0002,
+      "loss": 1.1494,
+      "step": 590
+    },
+    {
+      "epoch": 0.374414976599064,
+      "grad_norm": 0.7436304092407227,
+      "learning_rate": 0.0002,
+      "loss": 1.2434,
+      "step": 600
+    },
+    {
+      "epoch": 0.38065522620904835,
+      "grad_norm": 0.5486089587211609,
+      "learning_rate": 0.0002,
+      "loss": 1.1647,
+      "step": 610
+    },
+    {
+      "epoch": 0.3868954758190328,
+      "grad_norm": 0.590490996837616,
+      "learning_rate": 0.0002,
+      "loss": 1.2313,
+      "step": 620
+    },
+    {
+      "epoch": 0.39313572542901715,
+      "grad_norm": 0.6123483180999756,
+      "learning_rate": 0.0002,
+      "loss": 1.2017,
+      "step": 630
+    },
+    {
+      "epoch": 0.3993759750390016,
+      "grad_norm": 0.6024467349052429,
+      "learning_rate": 0.0002,
+      "loss": 1.2625,
+      "step": 640
+    },
+    {
+      "epoch": 0.40561622464898595,
+      "grad_norm": 0.7221348881721497,
+      "learning_rate": 0.0002,
+      "loss": 1.1165,
+      "step": 650
+    },
+    {
+      "epoch": 0.4118564742589704,
+      "grad_norm": 1.225263237953186,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 660
+    },
+    {
+      "epoch": 0.41809672386895474,
+      "grad_norm": 0.69964599609375,
+      "learning_rate": 0.0002,
+      "loss": 1.2144,
+      "step": 670
+    },
+    {
+      "epoch": 0.4243369734789392,
+      "grad_norm": 0.8724095821380615,
+      "learning_rate": 0.0002,
+      "loss": 1.2997,
+      "step": 680
+    },
+    {
+      "epoch": 0.43057722308892354,
+      "grad_norm": 0.9811216592788696,
+      "learning_rate": 0.0002,
+      "loss": 1.2633,
+      "step": 690
+    },
+    {
+      "epoch": 0.43681747269890797,
+      "grad_norm": 0.6494196057319641,
+      "learning_rate": 0.0002,
+      "loss": 1.2051,
+      "step": 700
+    },
+    {
+      "epoch": 0.44305772230889234,
+      "grad_norm": 0.683635950088501,
+      "learning_rate": 0.0002,
+      "loss": 1.1308,
+      "step": 710
+    },
+    {
+      "epoch": 0.44929797191887677,
+      "grad_norm": 0.7341657876968384,
+      "learning_rate": 0.0002,
+      "loss": 1.2318,
+      "step": 720
+    },
+    {
+      "epoch": 0.45553822152886114,
+      "grad_norm": 0.5400960445404053,
+      "learning_rate": 0.0002,
+      "loss": 1.1967,
+      "step": 730
+    },
+    {
+      "epoch": 0.46177847113884557,
+      "grad_norm": 0.7045732736587524,
+      "learning_rate": 0.0002,
+      "loss": 1.2383,
+      "step": 740
+    },
+    {
+      "epoch": 0.46801872074882994,
+      "grad_norm": 0.6595138907432556,
+      "learning_rate": 0.0002,
+      "loss": 1.1054,
+      "step": 750
+    },
+    {
+      "epoch": 0.47425897035881437,
+      "grad_norm": 0.7014768719673157,
+      "learning_rate": 0.0002,
+      "loss": 1.1268,
+      "step": 760
+    },
+    {
+      "epoch": 0.48049921996879874,
+      "grad_norm": 0.7153908014297485,
+      "learning_rate": 0.0002,
+      "loss": 1.2706,
+      "step": 770
+    },
+    {
+      "epoch": 0.48673946957878317,
+      "grad_norm": 0.5434338450431824,
+      "learning_rate": 0.0002,
+      "loss": 1.1744,
+      "step": 780
+    },
+    {
+      "epoch": 0.49297971918876754,
+      "grad_norm": 0.626314103603363,
+      "learning_rate": 0.0002,
+      "loss": 1.0866,
+      "step": 790
+    },
+    {
+      "epoch": 0.49921996879875197,
+      "grad_norm": 0.6473543643951416,
+      "learning_rate": 0.0002,
+      "loss": 1.2015,
+      "step": 800
+    },
+    {
+      "epoch": 0.5054602184087363,
+      "grad_norm": 0.6651485562324524,
+      "learning_rate": 0.0002,
+      "loss": 1.2547,
+      "step": 810
+    },
+    {
+      "epoch": 0.5117004680187207,
+      "grad_norm": 0.618462085723877,
+      "learning_rate": 0.0002,
+      "loss": 1.2447,
+      "step": 820
+    },
+    {
+      "epoch": 0.5179407176287052,
+      "grad_norm": 0.7226157784461975,
+      "learning_rate": 0.0002,
+      "loss": 1.2192,
+      "step": 830
+    },
+    {
+      "epoch": 0.5241809672386896,
+      "grad_norm": 1.5444509983062744,
+      "learning_rate": 0.0002,
+      "loss": 1.1925,
+      "step": 840
+    },
+    {
+      "epoch": 0.5304212168486739,
+      "grad_norm": 0.6177148818969727,
+      "learning_rate": 0.0002,
+      "loss": 1.1245,
+      "step": 850
+    },
+    {
+      "epoch": 0.5366614664586583,
+      "grad_norm": 0.711006224155426,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 860
+    },
+    {
+      "epoch": 0.5429017160686428,
+      "grad_norm": 0.7458148002624512,
+      "learning_rate": 0.0002,
+      "loss": 1.2325,
+      "step": 870
+    },
+    {
+      "epoch": 0.5491419656786272,
+      "grad_norm": 0.5732790231704712,
+      "learning_rate": 0.0002,
+      "loss": 1.1721,
+      "step": 880
+    },
+    {
+      "epoch": 0.5553822152886115,
+      "grad_norm": 0.6373953819274902,
+      "learning_rate": 0.0002,
+      "loss": 1.2311,
+      "step": 890
+    },
+    {
+      "epoch": 0.5616224648985959,
+      "grad_norm": 1.0794939994812012,
+      "learning_rate": 0.0002,
+      "loss": 1.1618,
+      "step": 900
+    },
+    {
+      "epoch": 0.5678627145085804,
+      "grad_norm": 0.6735630035400391,
+      "learning_rate": 0.0002,
+      "loss": 1.1471,
+      "step": 910
+    },
+    {
+      "epoch": 0.5741029641185648,
+      "grad_norm": 0.6513162851333618,
+      "learning_rate": 0.0002,
+      "loss": 1.2506,
+      "step": 920
+    },
+    {
+      "epoch": 0.5803432137285491,
+      "grad_norm": 1.188833236694336,
+      "learning_rate": 0.0002,
+      "loss": 1.1328,
+      "step": 930
+    },
+    {
+      "epoch": 0.5865834633385335,
+      "grad_norm": 0.6634365320205688,
+      "learning_rate": 0.0002,
+      "loss": 1.2434,
+      "step": 940
+    },
+    {
+      "epoch": 0.592823712948518,
+      "grad_norm": 2.542186975479126,
+      "learning_rate": 0.0002,
+      "loss": 1.1143,
+      "step": 950
+    },
+    {
+      "epoch": 0.5990639625585024,
+      "grad_norm": 0.9277311563491821,
+      "learning_rate": 0.0002,
+      "loss": 1.1844,
+      "step": 960
+    },
+    {
+      "epoch": 0.6053042121684867,
+      "grad_norm": 0.7193790674209595,
+      "learning_rate": 0.0002,
+      "loss": 1.1732,
+      "step": 970
+    },
+    {
+      "epoch": 0.6115444617784711,
+      "grad_norm": 0.5681775808334351,
+      "learning_rate": 0.0002,
+      "loss": 1.2559,
+      "step": 980
+    },
+    {
+      "epoch": 0.6177847113884556,
+      "grad_norm": 0.6696691513061523,
+      "learning_rate": 0.0002,
+      "loss": 1.1388,
+      "step": 990
+    },
+    {
+      "epoch": 0.62402496099844,
+      "grad_norm": 0.721344530582428,
+      "learning_rate": 0.0002,
+      "loss": 1.1189,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6302652106084243,
+      "grad_norm": 0.8425031304359436,
+      "learning_rate": 0.0002,
+      "loss": 1.1777,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6365054602184087,
+      "grad_norm": 0.7791627645492554,
+      "learning_rate": 0.0002,
+      "loss": 1.2199,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6427457098283932,
+      "grad_norm": 0.5969202518463135,
+      "learning_rate": 0.0002,
+      "loss": 1.1708,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6489859594383776,
+      "grad_norm": 0.8637568950653076,
+      "learning_rate": 0.0002,
+      "loss": 1.272,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6552262090483619,
+      "grad_norm": 0.6667028069496155,
+      "learning_rate": 0.0002,
+      "loss": 1.0522,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6614664586583463,
+      "grad_norm": 0.8002216219902039,
+      "learning_rate": 0.0002,
+      "loss": 1.2185,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6677067082683308,
+      "grad_norm": 0.6133790612220764,
+      "learning_rate": 0.0002,
+      "loss": 1.2724,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6739469578783152,
+      "grad_norm": 0.8466842770576477,
+      "learning_rate": 0.0002,
+      "loss": 1.2591,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6801872074882995,
+      "grad_norm": 0.534599244594574,
+      "learning_rate": 0.0002,
+      "loss": 1.1045,
+      "step": 1090
+    },
+    {
+      "epoch": 0.6864274570982839,
+      "grad_norm": 0.5979007482528687,
+      "learning_rate": 0.0002,
+      "loss": 1.1631,
+      "step": 1100
+    },
+    {
+      "epoch": 0.6926677067082684,
+      "grad_norm": 0.6573676466941833,
+      "learning_rate": 0.0002,
+      "loss": 1.0852,
+      "step": 1110
+    },
+    {
+      "epoch": 0.6989079563182528,
+      "grad_norm": 2.110194683074951,
+      "learning_rate": 0.0002,
+      "loss": 1.1667,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7051482059282371,
+      "grad_norm": 0.6480202674865723,
+      "learning_rate": 0.0002,
+      "loss": 1.1643,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7113884555382215,
+      "grad_norm": 0.9007996320724487,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 1140
+    },
+    {
+      "epoch": 0.717628705148206,
+      "grad_norm": 0.864778995513916,
+      "learning_rate": 0.0002,
+      "loss": 1.1982,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7238689547581904,
+      "grad_norm": 0.6132529377937317,
+      "learning_rate": 0.0002,
+      "loss": 1.0931,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7301092043681747,
+      "grad_norm": 0.6523057222366333,
+      "learning_rate": 0.0002,
+      "loss": 1.0875,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7363494539781591,
+      "grad_norm": 0.6278107166290283,
+      "learning_rate": 0.0002,
+      "loss": 1.183,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7425897035881436,
+      "grad_norm": 0.9367479085922241,
+      "learning_rate": 0.0002,
+      "loss": 1.2556,
+      "step": 1190
+    },
+    {
+      "epoch": 0.748829953198128,
+      "grad_norm": 0.6311790943145752,
+      "learning_rate": 0.0002,
+      "loss": 1.2329,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7550702028081123,
+      "grad_norm": 0.594434916973114,
+      "learning_rate": 0.0002,
+      "loss": 1.1126,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7613104524180967,
+      "grad_norm": 0.7832707166671753,
+      "learning_rate": 0.0002,
+      "loss": 1.126,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7675507020280812,
+      "grad_norm": 0.6111505627632141,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 1230
+    },
+    {
+      "epoch": 0.7737909516380655,
+      "grad_norm": 0.6673868894577026,
+      "learning_rate": 0.0002,
+      "loss": 1.1395,
+      "step": 1240
+    },
+    {
+      "epoch": 0.7800312012480499,
+      "grad_norm": 0.740943431854248,
+      "learning_rate": 0.0002,
+      "loss": 1.1455,
+      "step": 1250
+    },
+    {
+      "epoch": 0.7862714508580343,
+      "grad_norm": 0.6874880790710449,
+      "learning_rate": 0.0002,
+      "loss": 1.2677,
+      "step": 1260
+    },
+    {
+      "epoch": 0.7925117004680188,
+      "grad_norm": 0.6566919684410095,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 1270
+    },
+    {
+      "epoch": 0.7987519500780031,
+      "grad_norm": 0.5894289612770081,
+      "learning_rate": 0.0002,
+      "loss": 1.1479,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8049921996879875,
+      "grad_norm": 0.8081681132316589,
+      "learning_rate": 0.0002,
+      "loss": 1.2025,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8112324492979719,
+      "grad_norm": 0.6353256106376648,
+      "learning_rate": 0.0002,
+      "loss": 1.1218,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8174726989079563,
+      "grad_norm": 0.5706279277801514,
+      "learning_rate": 0.0002,
+      "loss": 1.2244,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8237129485179407,
+      "grad_norm": 0.5903506278991699,
+      "learning_rate": 0.0002,
+      "loss": 1.1468,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8299531981279251,
+      "grad_norm": 0.5297395586967468,
+      "learning_rate": 0.0002,
+      "loss": 1.1757,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8361934477379095,
+      "grad_norm": 0.6525882482528687,
+      "learning_rate": 0.0002,
+      "loss": 1.0841,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8424336973478939,
+      "grad_norm": 0.8756698966026306,
+      "learning_rate": 0.0002,
+      "loss": 1.194,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8486739469578783,
+      "grad_norm": 1.0452430248260498,
+      "learning_rate": 0.0002,
+      "loss": 1.2273,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8549141965678627,
+      "grad_norm": 0.6416711807250977,
+      "learning_rate": 0.0002,
+      "loss": 1.0973,
+      "step": 1370
+    },
+    {
+      "epoch": 0.8611544461778471,
+      "grad_norm": 0.49327632784843445,
+      "learning_rate": 0.0002,
+      "loss": 1.2116,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8673946957878315,
+      "grad_norm": 0.6748191714286804,
+      "learning_rate": 0.0002,
+      "loss": 1.1374,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8736349453978159,
+      "grad_norm": 1.153362512588501,
+      "learning_rate": 0.0002,
+      "loss": 1.2677,
+      "step": 1400
+    },
+    {
+      "epoch": 0.8798751950078003,
+      "grad_norm": 0.8151076436042786,
+      "learning_rate": 0.0002,
+      "loss": 1.1562,
+      "step": 1410
+    },
+    {
+      "epoch": 0.8861154446177847,
+      "grad_norm": 1.00041925907135,
+      "learning_rate": 0.0002,
+      "loss": 1.1389,
+      "step": 1420
+    },
+    {
+      "epoch": 0.8923556942277691,
+      "grad_norm": 0.7730890512466431,
+      "learning_rate": 0.0002,
+      "loss": 1.2937,
+      "step": 1430
+    },
+    {
+      "epoch": 0.8985959438377535,
+      "grad_norm": 0.5944583415985107,
+      "learning_rate": 0.0002,
+      "loss": 1.1201,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9048361934477379,
+      "grad_norm": 1.2791529893875122,
+      "learning_rate": 0.0002,
+      "loss": 1.055,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9110764430577223,
+      "grad_norm": 0.6362823843955994,
+      "learning_rate": 0.0002,
+      "loss": 1.1659,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9173166926677067,
+      "grad_norm": 0.5871922969818115,
+      "learning_rate": 0.0002,
+      "loss": 1.1327,
+      "step": 1470
+    },
+    {
+      "epoch": 0.9235569422776911,
+      "grad_norm": 0.8008357286453247,
+      "learning_rate": 0.0002,
+      "loss": 1.1736,
+      "step": 1480
+    },
+    {
+      "epoch": 0.9297971918876755,
+      "grad_norm": 0.89249587059021,
+      "learning_rate": 0.0002,
+      "loss": 1.1132,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9360374414976599,
+      "grad_norm": 0.7067795395851135,
+      "learning_rate": 0.0002,
+      "loss": 1.1981,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9422776911076443,
+      "grad_norm": 0.8353140354156494,
+      "learning_rate": 0.0002,
+      "loss": 1.2021,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9485179407176287,
+      "grad_norm": 0.7982086539268494,
+      "learning_rate": 0.0002,
+      "loss": 1.1552,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9547581903276131,
+      "grad_norm": 0.6895356178283691,
+      "learning_rate": 0.0002,
+      "loss": 1.1823,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9609984399375975,
+      "grad_norm": 0.6823073029518127,
+      "learning_rate": 0.0002,
+      "loss": 1.1949,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9672386895475819,
+      "grad_norm": 0.6817622184753418,
+      "learning_rate": 0.0002,
+      "loss": 1.0287,
+      "step": 1550
+    },
+    {
+      "epoch": 0.9734789391575663,
+      "grad_norm": 2.095001459121704,
+      "learning_rate": 0.0002,
+      "loss": 1.131,
+      "step": 1560
+    },
+    {
+      "epoch": 0.9797191887675507,
+      "grad_norm": 1.0685795545578003,
+      "learning_rate": 0.0002,
+      "loss": 1.1759,
+      "step": 1570
+    },
+    {
+      "epoch": 0.9859594383775351,
+      "grad_norm": 0.6668464541435242,
+      "learning_rate": 0.0002,
+      "loss": 1.2076,
+      "step": 1580
+    },
+    {
+      "epoch": 0.9921996879875195,
+      "grad_norm": 0.6560982465744019,
+      "learning_rate": 0.0002,
+      "loss": 1.216,
+      "step": 1590
+    },
+    {
+      "epoch": 0.9984399375975039,
+      "grad_norm": 0.634287416934967,
+      "learning_rate": 0.0002,
+      "loss": 1.1209,
+      "step": 1600
+    },
+    {
+      "epoch": 0.9996879875195008,
+      "eval_loss": 1.1827317476272583,
+      "eval_runtime": 63.566,
+      "eval_samples_per_second": 7.174,
+      "eval_steps_per_second": 0.897,
+      "step": 1602
+    },
+    {
+      "epoch": 1.0046801872074882,
+      "grad_norm": 0.7423340678215027,
+      "learning_rate": 0.0002,
+      "loss": 1.0877,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0109204368174727,
+      "grad_norm": 0.7822794318199158,
+      "learning_rate": 0.0002,
+      "loss": 1.0818,
+      "step": 1620
+    },
+    {
+      "epoch": 1.0171606864274572,
+      "grad_norm": 0.7621238231658936,
+      "learning_rate": 0.0002,
+      "loss": 1.0402,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0234009360374414,
+      "grad_norm": 1.121740698814392,
+      "learning_rate": 0.0002,
+      "loss": 1.0718,
+      "step": 1640
+    },
+    {
+      "epoch": 1.029641185647426,
+      "grad_norm": 0.8615530133247375,
+      "learning_rate": 0.0002,
+      "loss": 1.0232,
+      "step": 1650
+    },
+    {
+      "epoch": 1.0358814352574104,
+      "grad_norm": 0.9135850667953491,
+      "learning_rate": 0.0002,
+      "loss": 0.9926,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0421216848673946,
+      "grad_norm": 0.6709032654762268,
+      "learning_rate": 0.0002,
+      "loss": 1.0029,
+      "step": 1670
+    },
+    {
+      "epoch": 1.0483619344773791,
+      "grad_norm": 0.8652639389038086,
+      "learning_rate": 0.0002,
+      "loss": 1.1006,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0546021840873634,
+      "grad_norm": 0.700859546661377,
+      "learning_rate": 0.0002,
+      "loss": 1.0518,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0608424336973479,
+      "grad_norm": 0.8015341758728027,
+      "learning_rate": 0.0002,
+      "loss": 0.9465,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0670826833073324,
+      "grad_norm": 0.8015555739402771,
+      "learning_rate": 0.0002,
+      "loss": 0.9767,
+      "step": 1710
+    },
+    {
+      "epoch": 1.0733229329173166,
+      "grad_norm": 0.7884244918823242,
+      "learning_rate": 0.0002,
+      "loss": 0.9956,
+      "step": 1720
+    },
+    {
+      "epoch": 1.079563182527301,
+      "grad_norm": 0.5757933855056763,
+      "learning_rate": 0.0002,
+      "loss": 1.1189,
+      "step": 1730
+    },
+    {
+      "epoch": 1.0858034321372856,
+      "grad_norm": 0.7748220562934875,
+      "learning_rate": 0.0002,
+      "loss": 1.0523,
+      "step": 1740
+    },
+    {
+      "epoch": 1.0920436817472698,
+      "grad_norm": 0.7868158221244812,
+      "learning_rate": 0.0002,
+      "loss": 1.0408,
+      "step": 1750
+    },
+    {
+      "epoch": 1.0982839313572543,
+      "grad_norm": 0.8389782309532166,
+      "learning_rate": 0.0002,
+      "loss": 1.0377,
+      "step": 1760
+    },
+    {
+      "epoch": 1.1045241809672386,
+      "grad_norm": 0.7349454760551453,
+      "learning_rate": 0.0002,
+      "loss": 1.0462,
+      "step": 1770
+    },
+    {
+      "epoch": 1.110764430577223,
+      "grad_norm": 0.928422749042511,
+      "learning_rate": 0.0002,
+      "loss": 1.0775,
+      "step": 1780
+    },
+    {
+      "epoch": 1.1170046801872076,
+      "grad_norm": 1.3305788040161133,
+      "learning_rate": 0.0002,
+      "loss": 1.1144,
+      "step": 1790
+    },
+    {
+      "epoch": 1.1232449297971918,
+      "grad_norm": 0.7305368185043335,
+      "learning_rate": 0.0002,
+      "loss": 1.0446,
+      "step": 1800
+    },
+    {
+      "epoch": 1.1294851794071763,
+      "grad_norm": 1.2106877565383911,
+      "learning_rate": 0.0002,
+      "loss": 1.0028,
+      "step": 1810
+    },
+    {
+      "epoch": 1.1357254290171608,
+      "grad_norm": 0.8563781380653381,
+      "learning_rate": 0.0002,
+      "loss": 1.0587,
+      "step": 1820
+    },
+    {
+      "epoch": 1.141965678627145,
+      "grad_norm": 0.8453772068023682,
+      "learning_rate": 0.0002,
+      "loss": 1.0901,
+      "step": 1830
+    },
+    {
+      "epoch": 1.1482059282371295,
+      "grad_norm": 0.7827452421188354,
+      "learning_rate": 0.0002,
+      "loss": 1.1149,
+      "step": 1840
+    },
+    {
+      "epoch": 1.154446177847114,
+      "grad_norm": 0.7457599639892578,
+      "learning_rate": 0.0002,
+      "loss": 1.0693,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1606864274570983,
+      "grad_norm": 0.7619354724884033,
+      "learning_rate": 0.0002,
+      "loss": 0.9426,
+      "step": 1860
+    },
+    {
+      "epoch": 1.1669266770670828,
+      "grad_norm": 0.7766537666320801,
+      "learning_rate": 0.0002,
+      "loss": 1.0504,
+      "step": 1870
+    },
+    {
+      "epoch": 1.173166926677067,
+      "grad_norm": 0.6223660111427307,
+      "learning_rate": 0.0002,
+      "loss": 1.0635,
+      "step": 1880
+    },
+    {
+      "epoch": 1.1794071762870515,
+      "grad_norm": 0.7259079217910767,
+      "learning_rate": 0.0002,
+      "loss": 0.9955,
+      "step": 1890
+    },
+    {
+      "epoch": 1.185647425897036,
+      "grad_norm": 1.0263047218322754,
+      "learning_rate": 0.0002,
+      "loss": 1.0457,
+      "step": 1900
+    },
+    {
+      "epoch": 1.1918876755070202,
+      "grad_norm": 0.9251036643981934,
+      "learning_rate": 0.0002,
+      "loss": 1.0626,
+      "step": 1910
+    },
+    {
+      "epoch": 1.1981279251170047,
+      "grad_norm": 0.6675537824630737,
+      "learning_rate": 0.0002,
+      "loss": 0.994,
+      "step": 1920
+    },
+    {
+      "epoch": 1.204368174726989,
+      "grad_norm": 1.1418719291687012,
+      "learning_rate": 0.0002,
+      "loss": 1.0163,
+      "step": 1930
+    },
+    {
+      "epoch": 1.2106084243369735,
+      "grad_norm": 0.6566996574401855,
+      "learning_rate": 0.0002,
+      "loss": 1.0065,
+      "step": 1940
+    },
+    {
+      "epoch": 1.216848673946958,
+      "grad_norm": 0.8776164650917053,
+      "learning_rate": 0.0002,
+      "loss": 1.1006,
+      "step": 1950
+    },
+    {
+      "epoch": 1.2230889235569422,
+      "grad_norm": 0.8911651968955994,
+      "learning_rate": 0.0002,
+      "loss": 0.9612,
+      "step": 1960
+    },
+    {
+      "epoch": 1.2293291731669267,
+      "grad_norm": 0.9851633906364441,
+      "learning_rate": 0.0002,
+      "loss": 0.9982,
+      "step": 1970
+    },
+    {
+      "epoch": 1.2355694227769112,
+      "grad_norm": 0.8044199347496033,
+      "learning_rate": 0.0002,
+      "loss": 0.9887,
+      "step": 1980
+    },
+    {
+      "epoch": 1.2418096723868954,
+      "grad_norm": 0.7628118395805359,
+      "learning_rate": 0.0002,
+      "loss": 1.0618,
+      "step": 1990
+    },
+    {
+      "epoch": 1.24804992199688,
+      "grad_norm": 1.2210139036178589,
+      "learning_rate": 0.0002,
+      "loss": 1.0252,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2542901716068644,
+      "grad_norm": 0.7742480039596558,
+      "learning_rate": 0.0002,
+      "loss": 1.0717,
+      "step": 2010
+    },
+    {
+      "epoch": 1.2605304212168487,
+      "grad_norm": 0.8192761540412903,
+      "learning_rate": 0.0002,
+      "loss": 1.0437,
+      "step": 2020
+    },
+    {
+      "epoch": 1.2667706708268331,
+      "grad_norm": 0.7866397500038147,
+      "learning_rate": 0.0002,
+      "loss": 1.0017,
+      "step": 2030
+    },
+    {
+      "epoch": 1.2730109204368174,
+      "grad_norm": 0.7491360306739807,
+      "learning_rate": 0.0002,
+      "loss": 1.0831,
+      "step": 2040
+    },
+    {
+      "epoch": 1.2792511700468019,
+      "grad_norm": 0.9893434047698975,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 2050
+    },
+    {
+      "epoch": 1.2854914196567861,
+      "grad_norm": 0.7714972496032715,
+      "learning_rate": 0.0002,
+      "loss": 0.9595,
+      "step": 2060
+    },
+    {
+      "epoch": 1.2917316692667706,
+      "grad_norm": 0.7672552466392517,
+      "learning_rate": 0.0002,
+      "loss": 0.9945,
+      "step": 2070
+    },
+    {
+      "epoch": 1.2979719188767551,
+      "grad_norm": 1.8601958751678467,
+      "learning_rate": 0.0002,
+      "loss": 0.9524,
+      "step": 2080
+    },
+    {
+      "epoch": 1.3042121684867394,
+      "grad_norm": 0.7728857398033142,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 2090
+    },
+    {
+      "epoch": 1.3104524180967239,
+      "grad_norm": 1.0262086391448975,
+      "learning_rate": 0.0002,
+      "loss": 1.0778,
+      "step": 2100
+    },
+    {
+      "epoch": 1.3166926677067083,
+      "grad_norm": 1.1083186864852905,
+      "learning_rate": 0.0002,
+      "loss": 1.007,
+      "step": 2110
+    },
+    {
+      "epoch": 1.3229329173166926,
+      "grad_norm": 0.9694948792457581,
+      "learning_rate": 0.0002,
+      "loss": 0.9547,
+      "step": 2120
+    },
+    {
+      "epoch": 1.329173166926677,
+      "grad_norm": 0.8996708989143372,
+      "learning_rate": 0.0002,
+      "loss": 1.137,
+      "step": 2130
+    },
+    {
+      "epoch": 1.3354134165366616,
+      "grad_norm": 0.828211784362793,
+      "learning_rate": 0.0002,
+      "loss": 1.013,
+      "step": 2140
+    },
+    {
+      "epoch": 1.3416536661466458,
+      "grad_norm": 0.7998611927032471,
+      "learning_rate": 0.0002,
+      "loss": 1.0333,
+      "step": 2150
+    },
+    {
+      "epoch": 1.3478939157566303,
+      "grad_norm": 1.2462408542633057,
+      "learning_rate": 0.0002,
+      "loss": 1.0579,
+      "step": 2160
+    },
+    {
+      "epoch": 1.3541341653666148,
+      "grad_norm": 0.7298257350921631,
+      "learning_rate": 0.0002,
+      "loss": 0.9858,
+      "step": 2170
+    },
+    {
+      "epoch": 1.360374414976599,
+      "grad_norm": 1.0242233276367188,
+      "learning_rate": 0.0002,
+      "loss": 1.0929,
+      "step": 2180
+    },
+    {
+      "epoch": 1.3666146645865835,
+      "grad_norm": 0.6309589743614197,
+      "learning_rate": 0.0002,
+      "loss": 0.9951,
+      "step": 2190
+    },
+    {
+      "epoch": 1.3728549141965678,
+      "grad_norm": 0.7779918313026428,
+      "learning_rate": 0.0002,
+      "loss": 0.9605,
+      "step": 2200
+    },
+    {
+      "epoch": 1.3790951638065523,
+      "grad_norm": 0.8241073489189148,
+      "learning_rate": 0.0002,
+      "loss": 1.1227,
+      "step": 2210
+    },
+    {
+      "epoch": 1.3853354134165365,
+      "grad_norm": 0.9407224655151367,
+      "learning_rate": 0.0002,
+      "loss": 0.9911,
+      "step": 2220
+    },
+    {
+      "epoch": 1.391575663026521,
+      "grad_norm": 0.7646933197975159,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 2230
+    },
+    {
+      "epoch": 1.3978159126365055,
+      "grad_norm": 0.951589047908783,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 2240
+    },
+    {
+      "epoch": 1.4040561622464898,
+      "grad_norm": 0.7215362787246704,
+      "learning_rate": 0.0002,
+      "loss": 1.0835,
+      "step": 2250
+    },
+    {
+      "epoch": 1.4102964118564743,
+      "grad_norm": 0.8220779895782471,
+      "learning_rate": 0.0002,
+      "loss": 0.9895,
+      "step": 2260
+    },
+    {
+      "epoch": 1.4165366614664587,
+      "grad_norm": 1.6396559476852417,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 2270
+    },
+    {
+      "epoch": 1.422776911076443,
+      "grad_norm": 0.9652542471885681,
+      "learning_rate": 0.0002,
+      "loss": 0.944,
+      "step": 2280
+    },
+    {
+      "epoch": 1.4290171606864275,
+      "grad_norm": 0.9859839081764221,
+      "learning_rate": 0.0002,
+      "loss": 0.9559,
+      "step": 2290
+    },
+    {
+      "epoch": 1.435257410296412,
+      "grad_norm": 1.1837399005889893,
+      "learning_rate": 0.0002,
+      "loss": 1.1192,
+      "step": 2300
+    },
+    {
+      "epoch": 1.4414976599063962,
+      "grad_norm": 1.1094632148742676,
+      "learning_rate": 0.0002,
+      "loss": 1.0207,
+      "step": 2310
+    },
+    {
+      "epoch": 1.4477379095163807,
+      "grad_norm": 1.0111202001571655,
+      "learning_rate": 0.0002,
+      "loss": 1.0406,
+      "step": 2320
+    },
+    {
+      "epoch": 1.4539781591263652,
+      "grad_norm": 0.8623846769332886,
+      "learning_rate": 0.0002,
+      "loss": 0.9982,
+      "step": 2330
+    },
+    {
+      "epoch": 1.4602184087363494,
+      "grad_norm": 0.9365978240966797,
+      "learning_rate": 0.0002,
+      "loss": 1.0827,
+      "step": 2340
+    },
+    {
+      "epoch": 1.466458658346334,
+      "grad_norm": 0.7573872208595276,
+      "learning_rate": 0.0002,
+      "loss": 1.0943,
+      "step": 2350
+    },
+    {
+      "epoch": 1.4726989079563182,
+      "grad_norm": 0.7914007902145386,
+      "learning_rate": 0.0002,
+      "loss": 1.0955,
+      "step": 2360
+    },
+    {
+      "epoch": 1.4789391575663027,
+      "grad_norm": 0.7893397212028503,
+      "learning_rate": 0.0002,
+      "loss": 1.0029,
+      "step": 2370
+    },
+    {
+      "epoch": 1.485179407176287,
+      "grad_norm": 0.9740095734596252,
+      "learning_rate": 0.0002,
+      "loss": 1.0441,
+      "step": 2380
+    },
+    {
+      "epoch": 1.4914196567862714,
+      "grad_norm": 1.1725326776504517,
+      "learning_rate": 0.0002,
+      "loss": 1.0301,
+      "step": 2390
+    },
+    {
+      "epoch": 1.497659906396256,
+      "grad_norm": 0.8515211343765259,
+      "learning_rate": 0.0002,
+      "loss": 1.0886,
+      "step": 2400
+    },
+    {
+      "epoch": 1.5039001560062402,
+      "grad_norm": 0.9571354389190674,
+      "learning_rate": 0.0002,
+      "loss": 1.0554,
+      "step": 2410
+    },
+    {
+      "epoch": 1.5101404056162246,
+      "grad_norm": 0.7786742448806763,
+      "learning_rate": 0.0002,
+      "loss": 1.0165,
+      "step": 2420
+    },
+    {
+      "epoch": 1.5163806552262091,
+      "grad_norm": 1.2387847900390625,
+      "learning_rate": 0.0002,
+      "loss": 1.007,
+      "step": 2430
+    },
+    {
+      "epoch": 1.5226209048361934,
+      "grad_norm": 0.9893562197685242,
+      "learning_rate": 0.0002,
+      "loss": 1.0641,
+      "step": 2440
+    },
+    {
+      "epoch": 1.5288611544461779,
+      "grad_norm": 0.8624372482299805,
+      "learning_rate": 0.0002,
+      "loss": 1.0714,
+      "step": 2450
+    },
+    {
+      "epoch": 1.5351014040561624,
+      "grad_norm": 0.7481670379638672,
+      "learning_rate": 0.0002,
+      "loss": 1.0145,
+      "step": 2460
+    },
+    {
+      "epoch": 1.5413416536661466,
+      "grad_norm": 0.7324886322021484,
+      "learning_rate": 0.0002,
+      "loss": 0.9921,
+      "step": 2470
+    },
+    {
+      "epoch": 1.547581903276131,
+      "grad_norm": 1.1707918643951416,
+      "learning_rate": 0.0002,
+      "loss": 1.0344,
+      "step": 2480
+    },
+    {
+      "epoch": 1.5538221528861156,
+      "grad_norm": 0.8224479556083679,
+      "learning_rate": 0.0002,
+      "loss": 0.9879,
+      "step": 2490
+    },
+    {
+      "epoch": 1.5600624024960998,
+      "grad_norm": 0.9338988661766052,
+      "learning_rate": 0.0002,
+      "loss": 1.0354,
+      "step": 2500
+    },
+    {
+      "epoch": 1.566302652106084,
+      "grad_norm": 0.9094964265823364,
+      "learning_rate": 0.0002,
+      "loss": 1.1097,
+      "step": 2510
+    },
+    {
+      "epoch": 1.5725429017160688,
+      "grad_norm": 0.7565743327140808,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 2520
+    },
+    {
+      "epoch": 1.578783151326053,
+      "grad_norm": 0.8102707266807556,
+      "learning_rate": 0.0002,
+      "loss": 1.0805,
+      "step": 2530
+    },
+    {
+      "epoch": 1.5850234009360373,
+      "grad_norm": 0.7755123972892761,
+      "learning_rate": 0.0002,
+      "loss": 0.9721,
+      "step": 2540
+    },
+    {
+      "epoch": 1.5912636505460218,
+      "grad_norm": 0.7241401672363281,
+      "learning_rate": 0.0002,
+      "loss": 1.0251,
+      "step": 2550
+    },
+    {
+      "epoch": 1.5975039001560063,
+      "grad_norm": 1.2833902835845947,
+      "learning_rate": 0.0002,
+      "loss": 1.0339,
+      "step": 2560
+    },
+    {
+      "epoch": 1.6037441497659906,
+      "grad_norm": 0.9504060745239258,
+      "learning_rate": 0.0002,
+      "loss": 1.0239,
+      "step": 2570
+    },
+    {
+      "epoch": 1.609984399375975,
+      "grad_norm": 0.8388465046882629,
+      "learning_rate": 0.0002,
+      "loss": 1.1343,
+      "step": 2580
+    },
+    {
+      "epoch": 1.6162246489859595,
+      "grad_norm": 0.7967075109481812,
+      "learning_rate": 0.0002,
+      "loss": 0.9885,
+      "step": 2590
+    },
+    {
+      "epoch": 1.6224648985959438,
+      "grad_norm": 0.7983989119529724,
+      "learning_rate": 0.0002,
+      "loss": 1.075,
+      "step": 2600
+    },
+    {
+      "epoch": 1.6287051482059283,
+      "grad_norm": 0.9089146852493286,
+      "learning_rate": 0.0002,
+      "loss": 1.0539,
+      "step": 2610
+    },
+    {
+      "epoch": 1.6349453978159127,
+      "grad_norm": 0.8271947503089905,
+      "learning_rate": 0.0002,
+      "loss": 1.043,
+      "step": 2620
+    },
+    {
+      "epoch": 1.641185647425897,
+      "grad_norm": 1.0343270301818848,
+      "learning_rate": 0.0002,
+      "loss": 1.0405,
+      "step": 2630
+    },
+    {
+      "epoch": 1.6474258970358813,
+      "grad_norm": 0.8620867729187012,
+      "learning_rate": 0.0002,
+      "loss": 0.9998,
+      "step": 2640
+    },
+    {
+      "epoch": 1.653666146645866,
+      "grad_norm": 1.0815283060073853,
+      "learning_rate": 0.0002,
+      "loss": 0.9758,
+      "step": 2650
+    },
+    {
+      "epoch": 1.6599063962558502,
+      "grad_norm": 0.7567992210388184,
+      "learning_rate": 0.0002,
+      "loss": 1.0074,
+      "step": 2660
+    },
+    {
+      "epoch": 1.6661466458658345,
+      "grad_norm": 0.9961640238761902,
+      "learning_rate": 0.0002,
+      "loss": 1.0115,
+      "step": 2670
+    },
+    {
+      "epoch": 1.672386895475819,
+      "grad_norm": 0.7986059188842773,
+      "learning_rate": 0.0002,
+      "loss": 1.0007,
+      "step": 2680
+    },
+    {
+      "epoch": 1.6786271450858035,
+      "grad_norm": 0.8693634271621704,
+      "learning_rate": 0.0002,
+      "loss": 1.0171,
+      "step": 2690
+    },
+    {
+      "epoch": 1.6848673946957877,
+      "grad_norm": 0.8925015926361084,
+      "learning_rate": 0.0002,
+      "loss": 1.0033,
+      "step": 2700
+    },
+    {
+      "epoch": 1.6911076443057722,
+      "grad_norm": 1.0562777519226074,
+      "learning_rate": 0.0002,
+      "loss": 0.9656,
+      "step": 2710
+    },
+    {
+      "epoch": 1.6973478939157567,
+      "grad_norm": 1.1360619068145752,
+      "learning_rate": 0.0002,
+      "loss": 1.0784,
+      "step": 2720
+    },
+    {
+      "epoch": 1.703588143525741,
+      "grad_norm": 0.8302593231201172,
+      "learning_rate": 0.0002,
+      "loss": 1.0338,
+      "step": 2730
+    },
+    {
+      "epoch": 1.7098283931357254,
+      "grad_norm": 0.8413597941398621,
+      "learning_rate": 0.0002,
+      "loss": 0.9493,
+      "step": 2740
+    },
+    {
+      "epoch": 1.71606864274571,
+      "grad_norm": 1.18924081325531,
+      "learning_rate": 0.0002,
+      "loss": 1.0412,
+      "step": 2750
+    },
+    {
+      "epoch": 1.7223088923556942,
+      "grad_norm": 0.7038135528564453,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 2760
+    },
+    {
+      "epoch": 1.7285491419656787,
+      "grad_norm": 0.9011945724487305,
+      "learning_rate": 0.0002,
+      "loss": 1.0257,
+      "step": 2770
+    },
+    {
+      "epoch": 1.7347893915756631,
+      "grad_norm": 0.9442067742347717,
+      "learning_rate": 0.0002,
+      "loss": 1.0581,
+      "step": 2780
+    },
+    {
+      "epoch": 1.7410296411856474,
+      "grad_norm": 1.7989352941513062,
+      "learning_rate": 0.0002,
+      "loss": 1.1302,
+      "step": 2790
+    },
+    {
+      "epoch": 1.7472698907956317,
+      "grad_norm": 0.7309429049491882,
+      "learning_rate": 0.0002,
+      "loss": 1.0352,
+      "step": 2800
+    },
+    {
+      "epoch": 1.7535101404056164,
+      "grad_norm": 0.9080949425697327,
+      "learning_rate": 0.0002,
+      "loss": 1.0418,
+      "step": 2810
+    },
+    {
+      "epoch": 1.7597503900156006,
+      "grad_norm": 0.9498275518417358,
+      "learning_rate": 0.0002,
+      "loss": 1.0508,
+      "step": 2820
+    },
+    {
+      "epoch": 1.765990639625585,
+      "grad_norm": 1.1082345247268677,
+      "learning_rate": 0.0002,
+      "loss": 1.0089,
+      "step": 2830
+    },
+    {
+      "epoch": 1.7722308892355694,
+      "grad_norm": 1.093387484550476,
+      "learning_rate": 0.0002,
+      "loss": 1.0512,
+      "step": 2840
+    },
+    {
+      "epoch": 1.7784711388455539,
+      "grad_norm": 1.0876753330230713,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 2850
+    },
+    {
+      "epoch": 1.7847113884555381,
+      "grad_norm": 0.8042762279510498,
+      "learning_rate": 0.0002,
+      "loss": 0.9417,
+      "step": 2860
+    },
+    {
+      "epoch": 1.7909516380655226,
+      "grad_norm": 1.0242412090301514,
+      "learning_rate": 0.0002,
+      "loss": 1.1274,
+      "step": 2870
+    },
+    {
+      "epoch": 1.797191887675507,
+      "grad_norm": 0.7575710415840149,
+      "learning_rate": 0.0002,
+      "loss": 1.1423,
+      "step": 2880
+    },
+    {
+      "epoch": 1.8034321372854913,
+      "grad_norm": 0.7589989304542542,
+      "learning_rate": 0.0002,
+      "loss": 1.0711,
+      "step": 2890
+    },
+    {
+      "epoch": 1.8096723868954758,
+      "grad_norm": 0.6194815635681152,
+      "learning_rate": 0.0002,
+      "loss": 0.9809,
+      "step": 2900
+    },
+    {
+      "epoch": 1.8159126365054603,
+      "grad_norm": 1.4249097108840942,
+      "learning_rate": 0.0002,
+      "loss": 1.0815,
+      "step": 2910
+    },
+    {
+      "epoch": 1.8221528861154446,
+      "grad_norm": 1.001883625984192,
+      "learning_rate": 0.0002,
+      "loss": 0.9565,
+      "step": 2920
+    },
+    {
+      "epoch": 1.828393135725429,
+      "grad_norm": 1.1704825162887573,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 2930
+    },
+    {
+      "epoch": 1.8346333853354135,
+      "grad_norm": 0.7918996810913086,
+      "learning_rate": 0.0002,
+      "loss": 1.0302,
+      "step": 2940
+    },
+    {
+      "epoch": 1.8408736349453978,
+      "grad_norm": 0.8411404490470886,
+      "learning_rate": 0.0002,
+      "loss": 1.0239,
+      "step": 2950
+    },
+    {
+      "epoch": 1.847113884555382,
+      "grad_norm": 0.719024658203125,
+      "learning_rate": 0.0002,
+      "loss": 0.9858,
+      "step": 2960
+    },
+    {
+      "epoch": 1.8533541341653668,
+      "grad_norm": 0.9834994673728943,
+      "learning_rate": 0.0002,
+      "loss": 1.0562,
+      "step": 2970
+    },
+    {
+      "epoch": 1.859594383775351,
+      "grad_norm": 0.8373038172721863,
+      "learning_rate": 0.0002,
+      "loss": 1.0728,
+      "step": 2980
+    },
+    {
+      "epoch": 1.8658346333853353,
+      "grad_norm": 1.0496515035629272,
+      "learning_rate": 0.0002,
+      "loss": 1.0565,
+      "step": 2990
+    },
+    {
+      "epoch": 1.8720748829953198,
+      "grad_norm": 0.8380734324455261,
+      "learning_rate": 0.0002,
+      "loss": 1.0449,
+      "step": 3000
+    },
+    {
+      "epoch": 1.8783151326053042,
+      "grad_norm": 1.7618961334228516,
+      "learning_rate": 0.0002,
+      "loss": 1.0296,
+      "step": 3010
+    },
+    {
+      "epoch": 1.8845553822152885,
+      "grad_norm": 0.8019407391548157,
+      "learning_rate": 0.0002,
+      "loss": 0.9879,
+      "step": 3020
+    },
+    {
+      "epoch": 1.890795631825273,
+      "grad_norm": 0.8352335095405579,
+      "learning_rate": 0.0002,
+      "loss": 0.9934,
+      "step": 3030
+    },
+    {
+      "epoch": 1.8970358814352575,
+      "grad_norm": 1.481913685798645,
+      "learning_rate": 0.0002,
+      "loss": 0.992,
+      "step": 3040
+    },
+    {
+      "epoch": 1.9032761310452417,
+      "grad_norm": 1.1864397525787354,
+      "learning_rate": 0.0002,
+      "loss": 1.057,
+      "step": 3050
+    },
+    {
+      "epoch": 1.9095163806552262,
+      "grad_norm": 1.031194806098938,
+      "learning_rate": 0.0002,
+      "loss": 1.0818,
+      "step": 3060
+    },
+    {
+      "epoch": 1.9157566302652107,
+      "grad_norm": 1.823204755783081,
+      "learning_rate": 0.0002,
+      "loss": 1.0128,
+      "step": 3070
+    },
+    {
+      "epoch": 1.921996879875195,
+      "grad_norm": 1.3258085250854492,
+      "learning_rate": 0.0002,
+      "loss": 1.0053,
+      "step": 3080
+    },
+    {
+      "epoch": 1.9282371294851794,
+      "grad_norm": 0.7169067859649658,
+      "learning_rate": 0.0002,
+      "loss": 1.095,
+      "step": 3090
+    },
+    {
+      "epoch": 1.934477379095164,
+      "grad_norm": 0.8333500027656555,
+      "learning_rate": 0.0002,
+      "loss": 1.0648,
+      "step": 3100
+    },
+    {
+      "epoch": 1.9407176287051482,
+      "grad_norm": 0.842961311340332,
+      "learning_rate": 0.0002,
+      "loss": 1.1044,
+      "step": 3110
+    },
+    {
+      "epoch": 1.9469578783151325,
+      "grad_norm": 1.1449346542358398,
+      "learning_rate": 0.0002,
+      "loss": 1.0419,
+      "step": 3120
+    },
+    {
+      "epoch": 1.9531981279251172,
+      "grad_norm": 1.12788724899292,
+      "learning_rate": 0.0002,
+      "loss": 1.0028,
+      "step": 3130
+    },
+    {
+      "epoch": 1.9594383775351014,
+      "grad_norm": 0.7468876838684082,
+      "learning_rate": 0.0002,
+      "loss": 0.9775,
+      "step": 3140
+    },
+    {
+      "epoch": 1.9656786271450857,
+      "grad_norm": 1.1087535619735718,
+      "learning_rate": 0.0002,
+      "loss": 1.0093,
+      "step": 3150
+    },
+    {
+      "epoch": 1.9719188767550702,
+      "grad_norm": 0.7425413131713867,
+      "learning_rate": 0.0002,
+      "loss": 0.996,
+      "step": 3160
+    },
+    {
+      "epoch": 1.9781591263650546,
+      "grad_norm": 0.7973074316978455,
+      "learning_rate": 0.0002,
+      "loss": 0.9577,
+      "step": 3170
+    },
+    {
+      "epoch": 1.984399375975039,
+      "grad_norm": 0.9974947571754456,
+      "learning_rate": 0.0002,
+      "loss": 1.0181,
+      "step": 3180
+    },
+    {
+      "epoch": 1.9906396255850234,
+      "grad_norm": 1.1947362422943115,
+      "learning_rate": 0.0002,
+      "loss": 0.9786,
+      "step": 3190
+    },
+    {
+      "epoch": 1.9968798751950079,
+      "grad_norm": 1.0778043270111084,
+      "learning_rate": 0.0002,
+      "loss": 1.0453,
+      "step": 3200
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1564315557479858,
+      "eval_runtime": 33.4296,
+      "eval_samples_per_second": 13.641,
+      "eval_steps_per_second": 1.705,
+      "step": 3205
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 12816,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.40618499096576e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}