MilaWang committed on Mar 28, 2025

Commit

e02d45c

verified ·

1 Parent(s): 4e3a6e4

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/trainer_state.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10108/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/trainer_state.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-11552/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/trainer_state.json +1049 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888/trainer_state.json +2065 -0

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c86b8a21be6e05311e63fbffeac1c2bf5c4d45b9a95e9c4201ec11b2894fa609
+size 109069176

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbce5ac5c82a1c0d80aacae597c3e913908fb2d4a979ef34ce35351d3e8a0bec
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3d9082560c8250426ea85a0e59a1c8960fc3bfabe27242a3b3f518330616c6a
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:279a749e6cfb1537df6cedb3acc0ba18b7f53b54dcfa745853407738a0a8146e
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a19515066be034f924e3a3510d2ffc822e3480fa7e99f72717b7c79d2ec35c97
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38753c24fe64a0515c822cd3ac60a6d32e8f10efb3ee119d169c891f359f9ca8
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ee5345beb0feb7902eaab0d96f68b2a2538442273ea6974db9aa270b1a6d112
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9aa70dcc6846b149a8dda56e93001484f5c6882d342002e3559f5ec0c01cab6
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cac4dd97edcaf09ee346fd3fa49f1b212e1a1f425281470aee43b9f3f81ad9e6
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:439a60efe80e9149f11d9c4114922855b2c255ef0449ec34dcc692241eb5afd4
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38753c24fe64a0515c822cd3ac60a6d32e8f10efb3ee119d169c891f359f9ca8
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88885cee5c57252b1809b1cfbb8694fe748b60a7ea792d5080104034383ba33c
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:136b7b3959de0e3e059332ef55c898ceb0d13be175dac3b256e949ffd16a1c23
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5aa91f7997973180e5859930b12f0aba3ec7a8cc1c6aa6faa9e4364952f6e20
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a62841eef9f087332a3100dcc5fd344ca152691acbbfbfc6a7373e36c14ac44c
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1049 @@

+{
+  "best_metric": 1.1866850852966309,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1444",
+  "epoch": 1.0,
+  "eval_steps": 10,
+  "global_step": 1444,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006925207756232687,
+      "grad_norm": 1.0920530557632446,
+      "learning_rate": 0.0002,
+      "loss": 1.8528,
+      "step": 10
+    },
+    {
+      "epoch": 0.013850415512465374,
+      "grad_norm": 0.9524772763252258,
+      "learning_rate": 0.0002,
+      "loss": 1.644,
+      "step": 20
+    },
+    {
+      "epoch": 0.02077562326869806,
+      "grad_norm": 1.0657835006713867,
+      "learning_rate": 0.0002,
+      "loss": 1.5746,
+      "step": 30
+    },
+    {
+      "epoch": 0.027700831024930747,
+      "grad_norm": 0.8247288465499878,
+      "learning_rate": 0.0002,
+      "loss": 1.5292,
+      "step": 40
+    },
+    {
+      "epoch": 0.03462603878116344,
+      "grad_norm": 0.7946091890335083,
+      "learning_rate": 0.0002,
+      "loss": 1.4266,
+      "step": 50
+    },
+    {
+      "epoch": 0.04155124653739612,
+      "grad_norm": 0.9081670045852661,
+      "learning_rate": 0.0002,
+      "loss": 1.4128,
+      "step": 60
+    },
+    {
+      "epoch": 0.04847645429362881,
+      "grad_norm": 0.6268877387046814,
+      "learning_rate": 0.0002,
+      "loss": 1.3102,
+      "step": 70
+    },
+    {
+      "epoch": 0.055401662049861494,
+      "grad_norm": 0.7338827252388,
+      "learning_rate": 0.0002,
+      "loss": 1.1669,
+      "step": 80
+    },
+    {
+      "epoch": 0.062326869806094184,
+      "grad_norm": 0.7672784924507141,
+      "learning_rate": 0.0002,
+      "loss": 1.3046,
+      "step": 90
+    },
+    {
+      "epoch": 0.06925207756232687,
+      "grad_norm": 0.6481738090515137,
+      "learning_rate": 0.0002,
+      "loss": 1.2472,
+      "step": 100
+    },
+    {
+      "epoch": 0.07617728531855955,
+      "grad_norm": 0.8287441730499268,
+      "learning_rate": 0.0002,
+      "loss": 1.2622,
+      "step": 110
+    },
+    {
+      "epoch": 0.08310249307479224,
+      "grad_norm": 0.8505423665046692,
+      "learning_rate": 0.0002,
+      "loss": 1.2355,
+      "step": 120
+    },
+    {
+      "epoch": 0.09002770083102493,
+      "grad_norm": 0.7897729873657227,
+      "learning_rate": 0.0002,
+      "loss": 1.2878,
+      "step": 130
+    },
+    {
+      "epoch": 0.09695290858725762,
+      "grad_norm": 0.5853613018989563,
+      "learning_rate": 0.0002,
+      "loss": 1.1961,
+      "step": 140
+    },
+    {
+      "epoch": 0.1038781163434903,
+      "grad_norm": 0.8110899925231934,
+      "learning_rate": 0.0002,
+      "loss": 1.1575,
+      "step": 150
+    },
+    {
+      "epoch": 0.11080332409972299,
+      "grad_norm": 0.7012475728988647,
+      "learning_rate": 0.0002,
+      "loss": 1.2082,
+      "step": 160
+    },
+    {
+      "epoch": 0.11772853185595568,
+      "grad_norm": 0.6915570497512817,
+      "learning_rate": 0.0002,
+      "loss": 1.2404,
+      "step": 170
+    },
+    {
+      "epoch": 0.12465373961218837,
+      "grad_norm": 0.6198431849479675,
+      "learning_rate": 0.0002,
+      "loss": 1.1986,
+      "step": 180
+    },
+    {
+      "epoch": 0.13157894736842105,
+      "grad_norm": 0.6338268518447876,
+      "learning_rate": 0.0002,
+      "loss": 1.2419,
+      "step": 190
+    },
+    {
+      "epoch": 0.13850415512465375,
+      "grad_norm": 0.6599787473678589,
+      "learning_rate": 0.0002,
+      "loss": 1.1719,
+      "step": 200
+    },
+    {
+      "epoch": 0.14542936288088643,
+      "grad_norm": 0.5243592262268066,
+      "learning_rate": 0.0002,
+      "loss": 1.2069,
+      "step": 210
+    },
+    {
+      "epoch": 0.1523545706371191,
+      "grad_norm": 0.7506922483444214,
+      "learning_rate": 0.0002,
+      "loss": 1.2237,
+      "step": 220
+    },
+    {
+      "epoch": 0.1592797783933518,
+      "grad_norm": 0.7365843057632446,
+      "learning_rate": 0.0002,
+      "loss": 1.1774,
+      "step": 230
+    },
+    {
+      "epoch": 0.16620498614958448,
+      "grad_norm": 0.6041411757469177,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 240
+    },
+    {
+      "epoch": 0.1731301939058172,
+      "grad_norm": 0.5634334683418274,
+      "learning_rate": 0.0002,
+      "loss": 1.252,
+      "step": 250
+    },
+    {
+      "epoch": 0.18005540166204986,
+      "grad_norm": 0.5572287440299988,
+      "learning_rate": 0.0002,
+      "loss": 1.1843,
+      "step": 260
+    },
+    {
+      "epoch": 0.18698060941828254,
+      "grad_norm": 0.8472719788551331,
+      "learning_rate": 0.0002,
+      "loss": 1.1742,
+      "step": 270
+    },
+    {
+      "epoch": 0.19390581717451524,
+      "grad_norm": 0.6819698810577393,
+      "learning_rate": 0.0002,
+      "loss": 1.1148,
+      "step": 280
+    },
+    {
+      "epoch": 0.20083102493074792,
+      "grad_norm": 0.6842968463897705,
+      "learning_rate": 0.0002,
+      "loss": 1.199,
+      "step": 290
+    },
+    {
+      "epoch": 0.2077562326869806,
+      "grad_norm": 0.5447699427604675,
+      "learning_rate": 0.0002,
+      "loss": 1.1587,
+      "step": 300
+    },
+    {
+      "epoch": 0.2146814404432133,
+      "grad_norm": 0.5943161845207214,
+      "learning_rate": 0.0002,
+      "loss": 1.2508,
+      "step": 310
+    },
+    {
+      "epoch": 0.22160664819944598,
+      "grad_norm": 1.111795425415039,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 320
+    },
+    {
+      "epoch": 0.22853185595567868,
+      "grad_norm": 0.6633310317993164,
+      "learning_rate": 0.0002,
+      "loss": 1.1938,
+      "step": 330
+    },
+    {
+      "epoch": 0.23545706371191136,
+      "grad_norm": 2.072023868560791,
+      "learning_rate": 0.0002,
+      "loss": 1.2189,
+      "step": 340
+    },
+    {
+      "epoch": 0.24238227146814403,
+      "grad_norm": 0.784428060054779,
+      "learning_rate": 0.0002,
+      "loss": 1.2753,
+      "step": 350
+    },
+    {
+      "epoch": 0.24930747922437674,
+      "grad_norm": 0.5437638759613037,
+      "learning_rate": 0.0002,
+      "loss": 1.2269,
+      "step": 360
+    },
+    {
+      "epoch": 0.2562326869806094,
+      "grad_norm": 0.4859085977077484,
+      "learning_rate": 0.0002,
+      "loss": 1.1403,
+      "step": 370
+    },
+    {
+      "epoch": 0.2631578947368421,
+      "grad_norm": 1.659372091293335,
+      "learning_rate": 0.0002,
+      "loss": 1.1949,
+      "step": 380
+    },
+    {
+      "epoch": 0.27008310249307477,
+      "grad_norm": 0.7998494505882263,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 390
+    },
+    {
+      "epoch": 0.2770083102493075,
+      "grad_norm": 0.6909512877464294,
+      "learning_rate": 0.0002,
+      "loss": 1.2495,
+      "step": 400
+    },
+    {
+      "epoch": 0.2839335180055402,
+      "grad_norm": 0.7771576642990112,
+      "learning_rate": 0.0002,
+      "loss": 1.1665,
+      "step": 410
+    },
+    {
+      "epoch": 0.29085872576177285,
+      "grad_norm": 0.5854765176773071,
+      "learning_rate": 0.0002,
+      "loss": 1.2721,
+      "step": 420
+    },
+    {
+      "epoch": 0.29778393351800553,
+      "grad_norm": 0.5054845213890076,
+      "learning_rate": 0.0002,
+      "loss": 1.2623,
+      "step": 430
+    },
+    {
+      "epoch": 0.3047091412742382,
+      "grad_norm": 0.4726078510284424,
+      "learning_rate": 0.0002,
+      "loss": 1.2714,
+      "step": 440
+    },
+    {
+      "epoch": 0.31163434903047094,
+      "grad_norm": 0.8064747452735901,
+      "learning_rate": 0.0002,
+      "loss": 1.2929,
+      "step": 450
+    },
+    {
+      "epoch": 0.3185595567867036,
+      "grad_norm": 1.1799591779708862,
+      "learning_rate": 0.0002,
+      "loss": 1.2516,
+      "step": 460
+    },
+    {
+      "epoch": 0.3254847645429363,
+      "grad_norm": 0.580531656742096,
+      "learning_rate": 0.0002,
+      "loss": 1.137,
+      "step": 470
+    },
+    {
+      "epoch": 0.33240997229916897,
+      "grad_norm": 0.7213053703308105,
+      "learning_rate": 0.0002,
+      "loss": 1.3027,
+      "step": 480
+    },
+    {
+      "epoch": 0.33933518005540164,
+      "grad_norm": 0.8764265775680542,
+      "learning_rate": 0.0002,
+      "loss": 1.3223,
+      "step": 490
+    },
+    {
+      "epoch": 0.3462603878116344,
+      "grad_norm": 0.539807140827179,
+      "learning_rate": 0.0002,
+      "loss": 1.1672,
+      "step": 500
+    },
+    {
+      "epoch": 0.35318559556786705,
+      "grad_norm": 0.5516729950904846,
+      "learning_rate": 0.0002,
+      "loss": 1.238,
+      "step": 510
+    },
+    {
+      "epoch": 0.3601108033240997,
+      "grad_norm": 0.6443287134170532,
+      "learning_rate": 0.0002,
+      "loss": 1.1304,
+      "step": 520
+    },
+    {
+      "epoch": 0.3670360110803324,
+      "grad_norm": 0.5140393972396851,
+      "learning_rate": 0.0002,
+      "loss": 1.2379,
+      "step": 530
+    },
+    {
+      "epoch": 0.3739612188365651,
+      "grad_norm": 0.4929957389831543,
+      "learning_rate": 0.0002,
+      "loss": 1.2503,
+      "step": 540
+    },
+    {
+      "epoch": 0.3808864265927978,
+      "grad_norm": 0.5621340870857239,
+      "learning_rate": 0.0002,
+      "loss": 1.1855,
+      "step": 550
+    },
+    {
+      "epoch": 0.3878116343490305,
+      "grad_norm": 0.4839057922363281,
+      "learning_rate": 0.0002,
+      "loss": 1.2013,
+      "step": 560
+    },
+    {
+      "epoch": 0.39473684210526316,
+      "grad_norm": 0.5873980522155762,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 570
+    },
+    {
+      "epoch": 0.40166204986149584,
+      "grad_norm": 0.5613653659820557,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 580
+    },
+    {
+      "epoch": 0.4085872576177285,
+      "grad_norm": 0.6268449425697327,
+      "learning_rate": 0.0002,
+      "loss": 1.2402,
+      "step": 590
+    },
+    {
+      "epoch": 0.4155124653739612,
+      "grad_norm": 1.1188162565231323,
+      "learning_rate": 0.0002,
+      "loss": 1.1327,
+      "step": 600
+    },
+    {
+      "epoch": 0.4224376731301939,
+      "grad_norm": 0.6280108094215393,
+      "learning_rate": 0.0002,
+      "loss": 1.2013,
+      "step": 610
+    },
+    {
+      "epoch": 0.4293628808864266,
+      "grad_norm": 0.6851654648780823,
+      "learning_rate": 0.0002,
+      "loss": 1.2001,
+      "step": 620
+    },
+    {
+      "epoch": 0.4362880886426593,
+      "grad_norm": 0.7429733872413635,
+      "learning_rate": 0.0002,
+      "loss": 1.3518,
+      "step": 630
+    },
+    {
+      "epoch": 0.44321329639889195,
+      "grad_norm": 0.5793723464012146,
+      "learning_rate": 0.0002,
+      "loss": 1.1977,
+      "step": 640
+    },
+    {
+      "epoch": 0.45013850415512463,
+      "grad_norm": 0.599556565284729,
+      "learning_rate": 0.0002,
+      "loss": 1.0629,
+      "step": 650
+    },
+    {
+      "epoch": 0.45706371191135736,
+      "grad_norm": 0.6003446578979492,
+      "learning_rate": 0.0002,
+      "loss": 1.1855,
+      "step": 660
+    },
+    {
+      "epoch": 0.46398891966759004,
+      "grad_norm": 0.5799188613891602,
+      "learning_rate": 0.0002,
+      "loss": 1.1336,
+      "step": 670
+    },
+    {
+      "epoch": 0.4709141274238227,
+      "grad_norm": 0.8198838233947754,
+      "learning_rate": 0.0002,
+      "loss": 1.2998,
+      "step": 680
+    },
+    {
+      "epoch": 0.4778393351800554,
+      "grad_norm": 3.4946129322052,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 690
+    },
+    {
+      "epoch": 0.48476454293628807,
+      "grad_norm": 0.5827652812004089,
+      "learning_rate": 0.0002,
+      "loss": 1.1407,
+      "step": 700
+    },
+    {
+      "epoch": 0.4916897506925208,
+      "grad_norm": 0.7706893682479858,
+      "learning_rate": 0.0002,
+      "loss": 1.1039,
+      "step": 710
+    },
+    {
+      "epoch": 0.4986149584487535,
+      "grad_norm": 0.572982132434845,
+      "learning_rate": 0.0002,
+      "loss": 1.1646,
+      "step": 720
+    },
+    {
+      "epoch": 0.5055401662049861,
+      "grad_norm": 0.6739320755004883,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 730
+    },
+    {
+      "epoch": 0.5124653739612188,
+      "grad_norm": 0.6777870059013367,
+      "learning_rate": 0.0002,
+      "loss": 1.2512,
+      "step": 740
+    },
+    {
+      "epoch": 0.5193905817174516,
+      "grad_norm": 0.6553574204444885,
+      "learning_rate": 0.0002,
+      "loss": 1.1466,
+      "step": 750
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 0.6108107566833496,
+      "learning_rate": 0.0002,
+      "loss": 1.1747,
+      "step": 760
+    },
+    {
+      "epoch": 0.5332409972299169,
+      "grad_norm": 0.7425803542137146,
+      "learning_rate": 0.0002,
+      "loss": 1.2656,
+      "step": 770
+    },
+    {
+      "epoch": 0.5401662049861495,
+      "grad_norm": 0.5833735466003418,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 780
+    },
+    {
+      "epoch": 0.5470914127423823,
+      "grad_norm": 0.7790789604187012,
+      "learning_rate": 0.0002,
+      "loss": 1.1986,
+      "step": 790
+    },
+    {
+      "epoch": 0.554016620498615,
+      "grad_norm": 0.63566654920578,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 800
+    },
+    {
+      "epoch": 0.5609418282548476,
+      "grad_norm": 0.848599374294281,
+      "learning_rate": 0.0002,
+      "loss": 1.2435,
+      "step": 810
+    },
+    {
+      "epoch": 0.5678670360110804,
+      "grad_norm": 0.5431618094444275,
+      "learning_rate": 0.0002,
+      "loss": 1.1765,
+      "step": 820
+    },
+    {
+      "epoch": 0.574792243767313,
+      "grad_norm": 0.9087665677070618,
+      "learning_rate": 0.0002,
+      "loss": 1.1557,
+      "step": 830
+    },
+    {
+      "epoch": 0.5817174515235457,
+      "grad_norm": 0.7769272327423096,
+      "learning_rate": 0.0002,
+      "loss": 1.1919,
+      "step": 840
+    },
+    {
+      "epoch": 0.5886426592797784,
+      "grad_norm": 0.576531171798706,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 850
+    },
+    {
+      "epoch": 0.5955678670360111,
+      "grad_norm": 0.6455100178718567,
+      "learning_rate": 0.0002,
+      "loss": 1.1222,
+      "step": 860
+    },
+    {
+      "epoch": 0.6024930747922438,
+      "grad_norm": 0.7983489632606506,
+      "learning_rate": 0.0002,
+      "loss": 1.1871,
+      "step": 870
+    },
+    {
+      "epoch": 0.6094182825484764,
+      "grad_norm": 0.5942565202713013,
+      "learning_rate": 0.0002,
+      "loss": 1.2014,
+      "step": 880
+    },
+    {
+      "epoch": 0.6163434903047091,
+      "grad_norm": 1.1543667316436768,
+      "learning_rate": 0.0002,
+      "loss": 1.1541,
+      "step": 890
+    },
+    {
+      "epoch": 0.6232686980609419,
+      "grad_norm": 0.9434137344360352,
+      "learning_rate": 0.0002,
+      "loss": 1.2133,
+      "step": 900
+    },
+    {
+      "epoch": 0.6301939058171745,
+      "grad_norm": 0.6639302372932434,
+      "learning_rate": 0.0002,
+      "loss": 1.206,
+      "step": 910
+    },
+    {
+      "epoch": 0.6371191135734072,
+      "grad_norm": 0.5908414721488953,
+      "learning_rate": 0.0002,
+      "loss": 1.298,
+      "step": 920
+    },
+    {
+      "epoch": 0.6440443213296398,
+      "grad_norm": 0.8290476202964783,
+      "learning_rate": 0.0002,
+      "loss": 1.217,
+      "step": 930
+    },
+    {
+      "epoch": 0.6509695290858726,
+      "grad_norm": 0.6745959520339966,
+      "learning_rate": 0.0002,
+      "loss": 1.1333,
+      "step": 940
+    },
+    {
+      "epoch": 0.6578947368421053,
+      "grad_norm": 0.6290730834007263,
+      "learning_rate": 0.0002,
+      "loss": 1.2452,
+      "step": 950
+    },
+    {
+      "epoch": 0.6648199445983379,
+      "grad_norm": 0.7656926512718201,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 960
+    },
+    {
+      "epoch": 0.6717451523545707,
+      "grad_norm": 0.5598535537719727,
+      "learning_rate": 0.0002,
+      "loss": 1.0718,
+      "step": 970
+    },
+    {
+      "epoch": 0.6786703601108033,
+      "grad_norm": 0.9047361612319946,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 980
+    },
+    {
+      "epoch": 0.685595567867036,
+      "grad_norm": 0.6446689367294312,
+      "learning_rate": 0.0002,
+      "loss": 1.2108,
+      "step": 990
+    },
+    {
+      "epoch": 0.6925207756232687,
+      "grad_norm": 0.6625745296478271,
+      "learning_rate": 0.0002,
+      "loss": 1.1963,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6994459833795014,
+      "grad_norm": 0.8768957853317261,
+      "learning_rate": 0.0002,
+      "loss": 1.1837,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7063711911357341,
+      "grad_norm": 0.8657397031784058,
+      "learning_rate": 0.0002,
+      "loss": 1.2188,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7132963988919667,
+      "grad_norm": 0.5494899749755859,
+      "learning_rate": 0.0002,
+      "loss": 1.181,
+      "step": 1030
+    },
+    {
+      "epoch": 0.7202216066481995,
+      "grad_norm": 0.6485587954521179,
+      "learning_rate": 0.0002,
+      "loss": 1.1987,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7271468144044322,
+      "grad_norm": 0.6802186965942383,
+      "learning_rate": 0.0002,
+      "loss": 1.1402,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7340720221606648,
+      "grad_norm": 1.1354074478149414,
+      "learning_rate": 0.0002,
+      "loss": 1.1351,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7409972299168975,
+      "grad_norm": 0.47662121057510376,
+      "learning_rate": 0.0002,
+      "loss": 1.1664,
+      "step": 1070
+    },
+    {
+      "epoch": 0.7479224376731302,
+      "grad_norm": 0.7661218643188477,
+      "learning_rate": 0.0002,
+      "loss": 1.2385,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7548476454293629,
+      "grad_norm": 0.5716169476509094,
+      "learning_rate": 0.0002,
+      "loss": 1.0834,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7617728531855956,
+      "grad_norm": 0.7137057185173035,
+      "learning_rate": 0.0002,
+      "loss": 1.1129,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7686980609418282,
+      "grad_norm": 0.6075615882873535,
+      "learning_rate": 0.0002,
+      "loss": 1.2585,
+      "step": 1110
+    },
+    {
+      "epoch": 0.775623268698061,
+      "grad_norm": 0.7440633773803711,
+      "learning_rate": 0.0002,
+      "loss": 1.2645,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7825484764542936,
+      "grad_norm": 0.6310842037200928,
+      "learning_rate": 0.0002,
+      "loss": 1.1927,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7894736842105263,
+      "grad_norm": 0.9321758151054382,
+      "learning_rate": 0.0002,
+      "loss": 1.2051,
+      "step": 1140
+    },
+    {
+      "epoch": 0.796398891966759,
+      "grad_norm": 0.5627843141555786,
+      "learning_rate": 0.0002,
+      "loss": 1.1997,
+      "step": 1150
+    },
+    {
+      "epoch": 0.8033240997229917,
+      "grad_norm": 0.6857180595397949,
+      "learning_rate": 0.0002,
+      "loss": 1.0698,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8102493074792244,
+      "grad_norm": 0.7500603795051575,
+      "learning_rate": 0.0002,
+      "loss": 1.1898,
+      "step": 1170
+    },
+    {
+      "epoch": 0.817174515235457,
+      "grad_norm": 0.6745953559875488,
+      "learning_rate": 0.0002,
+      "loss": 1.1778,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8240997229916898,
+      "grad_norm": 0.7521778345108032,
+      "learning_rate": 0.0002,
+      "loss": 1.1761,
+      "step": 1190
+    },
+    {
+      "epoch": 0.8310249307479224,
+      "grad_norm": 0.8561633825302124,
+      "learning_rate": 0.0002,
+      "loss": 1.2064,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8379501385041551,
+      "grad_norm": 0.5534922480583191,
+      "learning_rate": 0.0002,
+      "loss": 1.2648,
+      "step": 1210
+    },
+    {
+      "epoch": 0.8448753462603878,
+      "grad_norm": 0.7363062500953674,
+      "learning_rate": 0.0002,
+      "loss": 1.1578,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8518005540166205,
+      "grad_norm": 0.767382800579071,
+      "learning_rate": 0.0002,
+      "loss": 1.1936,
+      "step": 1230
+    },
+    {
+      "epoch": 0.8587257617728532,
+      "grad_norm": 0.574267566204071,
+      "learning_rate": 0.0002,
+      "loss": 1.1598,
+      "step": 1240
+    },
+    {
+      "epoch": 0.8656509695290858,
+      "grad_norm": 0.7507286667823792,
+      "learning_rate": 0.0002,
+      "loss": 1.2896,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8725761772853186,
+      "grad_norm": 0.6347624659538269,
+      "learning_rate": 0.0002,
+      "loss": 1.1711,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8795013850415513,
+      "grad_norm": 0.8539272546768188,
+      "learning_rate": 0.0002,
+      "loss": 1.1623,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8864265927977839,
+      "grad_norm": 0.6943685412406921,
+      "learning_rate": 0.0002,
+      "loss": 1.1031,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8933518005540166,
+      "grad_norm": 0.7794576287269592,
+      "learning_rate": 0.0002,
+      "loss": 1.2052,
+      "step": 1290
+    },
+    {
+      "epoch": 0.9002770083102493,
+      "grad_norm": 0.7914884686470032,
+      "learning_rate": 0.0002,
+      "loss": 1.1464,
+      "step": 1300
+    },
+    {
+      "epoch": 0.907202216066482,
+      "grad_norm": 0.7065926790237427,
+      "learning_rate": 0.0002,
+      "loss": 1.1307,
+      "step": 1310
+    },
+    {
+      "epoch": 0.9141274238227147,
+      "grad_norm": 0.6672294735908508,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9210526315789473,
+      "grad_norm": 0.6708444356918335,
+      "learning_rate": 0.0002,
+      "loss": 1.1147,
+      "step": 1330
+    },
+    {
+      "epoch": 0.9279778393351801,
+      "grad_norm": 1.664172887802124,
+      "learning_rate": 0.0002,
+      "loss": 1.2664,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9349030470914127,
+      "grad_norm": 0.7257016897201538,
+      "learning_rate": 0.0002,
+      "loss": 1.1772,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9418282548476454,
+      "grad_norm": 0.7262710332870483,
+      "learning_rate": 0.0002,
+      "loss": 1.2025,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9487534626038782,
+      "grad_norm": 0.5953166484832764,
+      "learning_rate": 0.0002,
+      "loss": 1.0711,
+      "step": 1370
+    },
+    {
+      "epoch": 0.9556786703601108,
+      "grad_norm": 0.7025235295295715,
+      "learning_rate": 0.0002,
+      "loss": 1.2246,
+      "step": 1380
+    },
+    {
+      "epoch": 0.9626038781163435,
+      "grad_norm": 1.520357608795166,
+      "learning_rate": 0.0002,
+      "loss": 1.091,
+      "step": 1390
+    },
+    {
+      "epoch": 0.9695290858725761,
+      "grad_norm": 0.6089615821838379,
+      "learning_rate": 0.0002,
+      "loss": 1.1666,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9764542936288089,
+      "grad_norm": 0.6419881582260132,
+      "learning_rate": 0.0002,
+      "loss": 1.2216,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9833795013850416,
+      "grad_norm": 0.8522403836250305,
+      "learning_rate": 0.0002,
+      "loss": 1.1212,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9903047091412742,
+      "grad_norm": 0.9326029419898987,
+      "learning_rate": 0.0002,
+      "loss": 1.0722,
+      "step": 1430
+    },
+    {
+      "epoch": 0.997229916897507,
+      "grad_norm": 0.5441697239875793,
+      "learning_rate": 0.0002,
+      "loss": 1.177,
+      "step": 1440
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.1866850852966309,
+      "eval_runtime": 127.8342,
+      "eval_samples_per_second": 3.567,
+      "eval_steps_per_second": 0.446,
+      "step": 1444
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 11552,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.33551053651968e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38753c24fe64a0515c822cd3ac60a6d32e8f10efb3ee119d169c891f359f9ca8
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c86b8a21be6e05311e63fbffeac1c2bf5c4d45b9a95e9c4201ec11b2894fa609
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8776202f141e60bdf34278cc531fba22837a205fd8c009d7e60c672bcad321d3
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b555d59f19abaae2742149020cb4bfbcefa9f4f3dbd51064e9c85987695a45a3
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:970d1b92ca511d0cbf5e45b7362419d52110f61794d2137a71a067f77f149b19
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2065 @@

+{
+  "best_metric": 1.163743495941162,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2888",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 2888,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006925207756232687,
+      "grad_norm": 1.0920530557632446,
+      "learning_rate": 0.0002,
+      "loss": 1.8528,
+      "step": 10
+    },
+    {
+      "epoch": 0.013850415512465374,
+      "grad_norm": 0.9524772763252258,
+      "learning_rate": 0.0002,
+      "loss": 1.644,
+      "step": 20
+    },
+    {
+      "epoch": 0.02077562326869806,
+      "grad_norm": 1.0657835006713867,
+      "learning_rate": 0.0002,
+      "loss": 1.5746,
+      "step": 30
+    },
+    {
+      "epoch": 0.027700831024930747,
+      "grad_norm": 0.8247288465499878,
+      "learning_rate": 0.0002,
+      "loss": 1.5292,
+      "step": 40
+    },
+    {
+      "epoch": 0.03462603878116344,
+      "grad_norm": 0.7946091890335083,
+      "learning_rate": 0.0002,
+      "loss": 1.4266,
+      "step": 50
+    },
+    {
+      "epoch": 0.04155124653739612,
+      "grad_norm": 0.9081670045852661,
+      "learning_rate": 0.0002,
+      "loss": 1.4128,
+      "step": 60
+    },
+    {
+      "epoch": 0.04847645429362881,
+      "grad_norm": 0.6268877387046814,
+      "learning_rate": 0.0002,
+      "loss": 1.3102,
+      "step": 70
+    },
+    {
+      "epoch": 0.055401662049861494,
+      "grad_norm": 0.7338827252388,
+      "learning_rate": 0.0002,
+      "loss": 1.1669,
+      "step": 80
+    },
+    {
+      "epoch": 0.062326869806094184,
+      "grad_norm": 0.7672784924507141,
+      "learning_rate": 0.0002,
+      "loss": 1.3046,
+      "step": 90
+    },
+    {
+      "epoch": 0.06925207756232687,
+      "grad_norm": 0.6481738090515137,
+      "learning_rate": 0.0002,
+      "loss": 1.2472,
+      "step": 100
+    },
+    {
+      "epoch": 0.07617728531855955,
+      "grad_norm": 0.8287441730499268,
+      "learning_rate": 0.0002,
+      "loss": 1.2622,
+      "step": 110
+    },
+    {
+      "epoch": 0.08310249307479224,
+      "grad_norm": 0.8505423665046692,
+      "learning_rate": 0.0002,
+      "loss": 1.2355,
+      "step": 120
+    },
+    {
+      "epoch": 0.09002770083102493,
+      "grad_norm": 0.7897729873657227,
+      "learning_rate": 0.0002,
+      "loss": 1.2878,
+      "step": 130
+    },
+    {
+      "epoch": 0.09695290858725762,
+      "grad_norm": 0.5853613018989563,
+      "learning_rate": 0.0002,
+      "loss": 1.1961,
+      "step": 140
+    },
+    {
+      "epoch": 0.1038781163434903,
+      "grad_norm": 0.8110899925231934,
+      "learning_rate": 0.0002,
+      "loss": 1.1575,
+      "step": 150
+    },
+    {
+      "epoch": 0.11080332409972299,
+      "grad_norm": 0.7012475728988647,
+      "learning_rate": 0.0002,
+      "loss": 1.2082,
+      "step": 160
+    },
+    {
+      "epoch": 0.11772853185595568,
+      "grad_norm": 0.6915570497512817,
+      "learning_rate": 0.0002,
+      "loss": 1.2404,
+      "step": 170
+    },
+    {
+      "epoch": 0.12465373961218837,
+      "grad_norm": 0.6198431849479675,
+      "learning_rate": 0.0002,
+      "loss": 1.1986,
+      "step": 180
+    },
+    {
+      "epoch": 0.13157894736842105,
+      "grad_norm": 0.6338268518447876,
+      "learning_rate": 0.0002,
+      "loss": 1.2419,
+      "step": 190
+    },
+    {
+      "epoch": 0.13850415512465375,
+      "grad_norm": 0.6599787473678589,
+      "learning_rate": 0.0002,
+      "loss": 1.1719,
+      "step": 200
+    },
+    {
+      "epoch": 0.14542936288088643,
+      "grad_norm": 0.5243592262268066,
+      "learning_rate": 0.0002,
+      "loss": 1.2069,
+      "step": 210
+    },
+    {
+      "epoch": 0.1523545706371191,
+      "grad_norm": 0.7506922483444214,
+      "learning_rate": 0.0002,
+      "loss": 1.2237,
+      "step": 220
+    },
+    {
+      "epoch": 0.1592797783933518,
+      "grad_norm": 0.7365843057632446,
+      "learning_rate": 0.0002,
+      "loss": 1.1774,
+      "step": 230
+    },
+    {
+      "epoch": 0.16620498614958448,
+      "grad_norm": 0.6041411757469177,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 240
+    },
+    {
+      "epoch": 0.1731301939058172,
+      "grad_norm": 0.5634334683418274,
+      "learning_rate": 0.0002,
+      "loss": 1.252,
+      "step": 250
+    },
+    {
+      "epoch": 0.18005540166204986,
+      "grad_norm": 0.5572287440299988,
+      "learning_rate": 0.0002,
+      "loss": 1.1843,
+      "step": 260
+    },
+    {
+      "epoch": 0.18698060941828254,
+      "grad_norm": 0.8472719788551331,
+      "learning_rate": 0.0002,
+      "loss": 1.1742,
+      "step": 270
+    },
+    {
+      "epoch": 0.19390581717451524,
+      "grad_norm": 0.6819698810577393,
+      "learning_rate": 0.0002,
+      "loss": 1.1148,
+      "step": 280
+    },
+    {
+      "epoch": 0.20083102493074792,
+      "grad_norm": 0.6842968463897705,
+      "learning_rate": 0.0002,
+      "loss": 1.199,
+      "step": 290
+    },
+    {
+      "epoch": 0.2077562326869806,
+      "grad_norm": 0.5447699427604675,
+      "learning_rate": 0.0002,
+      "loss": 1.1587,
+      "step": 300
+    },
+    {
+      "epoch": 0.2146814404432133,
+      "grad_norm": 0.5943161845207214,
+      "learning_rate": 0.0002,
+      "loss": 1.2508,
+      "step": 310
+    },
+    {
+      "epoch": 0.22160664819944598,
+      "grad_norm": 1.111795425415039,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 320
+    },
+    {
+      "epoch": 0.22853185595567868,
+      "grad_norm": 0.6633310317993164,
+      "learning_rate": 0.0002,
+      "loss": 1.1938,
+      "step": 330
+    },
+    {
+      "epoch": 0.23545706371191136,
+      "grad_norm": 2.072023868560791,
+      "learning_rate": 0.0002,
+      "loss": 1.2189,
+      "step": 340
+    },
+    {
+      "epoch": 0.24238227146814403,
+      "grad_norm": 0.784428060054779,
+      "learning_rate": 0.0002,
+      "loss": 1.2753,
+      "step": 350
+    },
+    {
+      "epoch": 0.24930747922437674,
+      "grad_norm": 0.5437638759613037,
+      "learning_rate": 0.0002,
+      "loss": 1.2269,
+      "step": 360
+    },
+    {
+      "epoch": 0.2562326869806094,
+      "grad_norm": 0.4859085977077484,
+      "learning_rate": 0.0002,
+      "loss": 1.1403,
+      "step": 370
+    },
+    {
+      "epoch": 0.2631578947368421,
+      "grad_norm": 1.659372091293335,
+      "learning_rate": 0.0002,
+      "loss": 1.1949,
+      "step": 380
+    },
+    {
+      "epoch": 0.27008310249307477,
+      "grad_norm": 0.7998494505882263,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 390
+    },
+    {
+      "epoch": 0.2770083102493075,
+      "grad_norm": 0.6909512877464294,
+      "learning_rate": 0.0002,
+      "loss": 1.2495,
+      "step": 400
+    },
+    {
+      "epoch": 0.2839335180055402,
+      "grad_norm": 0.7771576642990112,
+      "learning_rate": 0.0002,
+      "loss": 1.1665,
+      "step": 410
+    },
+    {
+      "epoch": 0.29085872576177285,
+      "grad_norm": 0.5854765176773071,
+      "learning_rate": 0.0002,
+      "loss": 1.2721,
+      "step": 420
+    },
+    {
+      "epoch": 0.29778393351800553,
+      "grad_norm": 0.5054845213890076,
+      "learning_rate": 0.0002,
+      "loss": 1.2623,
+      "step": 430
+    },
+    {
+      "epoch": 0.3047091412742382,
+      "grad_norm": 0.4726078510284424,
+      "learning_rate": 0.0002,
+      "loss": 1.2714,
+      "step": 440
+    },
+    {
+      "epoch": 0.31163434903047094,
+      "grad_norm": 0.8064747452735901,
+      "learning_rate": 0.0002,
+      "loss": 1.2929,
+      "step": 450
+    },
+    {
+      "epoch": 0.3185595567867036,
+      "grad_norm": 1.1799591779708862,
+      "learning_rate": 0.0002,
+      "loss": 1.2516,
+      "step": 460
+    },
+    {
+      "epoch": 0.3254847645429363,
+      "grad_norm": 0.580531656742096,
+      "learning_rate": 0.0002,
+      "loss": 1.137,
+      "step": 470
+    },
+    {
+      "epoch": 0.33240997229916897,
+      "grad_norm": 0.7213053703308105,
+      "learning_rate": 0.0002,
+      "loss": 1.3027,
+      "step": 480
+    },
+    {
+      "epoch": 0.33933518005540164,
+      "grad_norm": 0.8764265775680542,
+      "learning_rate": 0.0002,
+      "loss": 1.3223,
+      "step": 490
+    },
+    {
+      "epoch": 0.3462603878116344,
+      "grad_norm": 0.539807140827179,
+      "learning_rate": 0.0002,
+      "loss": 1.1672,
+      "step": 500
+    },
+    {
+      "epoch": 0.35318559556786705,
+      "grad_norm": 0.5516729950904846,
+      "learning_rate": 0.0002,
+      "loss": 1.238,
+      "step": 510
+    },
+    {
+      "epoch": 0.3601108033240997,
+      "grad_norm": 0.6443287134170532,
+      "learning_rate": 0.0002,
+      "loss": 1.1304,
+      "step": 520
+    },
+    {
+      "epoch": 0.3670360110803324,
+      "grad_norm": 0.5140393972396851,
+      "learning_rate": 0.0002,
+      "loss": 1.2379,
+      "step": 530
+    },
+    {
+      "epoch": 0.3739612188365651,
+      "grad_norm": 0.4929957389831543,
+      "learning_rate": 0.0002,
+      "loss": 1.2503,
+      "step": 540
+    },
+    {
+      "epoch": 0.3808864265927978,
+      "grad_norm": 0.5621340870857239,
+      "learning_rate": 0.0002,
+      "loss": 1.1855,
+      "step": 550
+    },
+    {
+      "epoch": 0.3878116343490305,
+      "grad_norm": 0.4839057922363281,
+      "learning_rate": 0.0002,
+      "loss": 1.2013,
+      "step": 560
+    },
+    {
+      "epoch": 0.39473684210526316,
+      "grad_norm": 0.5873980522155762,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 570
+    },
+    {
+      "epoch": 0.40166204986149584,
+      "grad_norm": 0.5613653659820557,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 580
+    },
+    {
+      "epoch": 0.4085872576177285,
+      "grad_norm": 0.6268449425697327,
+      "learning_rate": 0.0002,
+      "loss": 1.2402,
+      "step": 590
+    },
+    {
+      "epoch": 0.4155124653739612,
+      "grad_norm": 1.1188162565231323,
+      "learning_rate": 0.0002,
+      "loss": 1.1327,
+      "step": 600
+    },
+    {
+      "epoch": 0.4224376731301939,
+      "grad_norm": 0.6280108094215393,
+      "learning_rate": 0.0002,
+      "loss": 1.2013,
+      "step": 610
+    },
+    {
+      "epoch": 0.4293628808864266,
+      "grad_norm": 0.6851654648780823,
+      "learning_rate": 0.0002,
+      "loss": 1.2001,
+      "step": 620
+    },
+    {
+      "epoch": 0.4362880886426593,
+      "grad_norm": 0.7429733872413635,
+      "learning_rate": 0.0002,
+      "loss": 1.3518,
+      "step": 630
+    },
+    {
+      "epoch": 0.44321329639889195,
+      "grad_norm": 0.5793723464012146,
+      "learning_rate": 0.0002,
+      "loss": 1.1977,
+      "step": 640
+    },
+    {
+      "epoch": 0.45013850415512463,
+      "grad_norm": 0.599556565284729,
+      "learning_rate": 0.0002,
+      "loss": 1.0629,
+      "step": 650
+    },
+    {
+      "epoch": 0.45706371191135736,
+      "grad_norm": 0.6003446578979492,
+      "learning_rate": 0.0002,
+      "loss": 1.1855,
+      "step": 660
+    },
+    {
+      "epoch": 0.46398891966759004,
+      "grad_norm": 0.5799188613891602,
+      "learning_rate": 0.0002,
+      "loss": 1.1336,
+      "step": 670
+    },
+    {
+      "epoch": 0.4709141274238227,
+      "grad_norm": 0.8198838233947754,
+      "learning_rate": 0.0002,
+      "loss": 1.2998,
+      "step": 680
+    },
+    {
+      "epoch": 0.4778393351800554,
+      "grad_norm": 3.4946129322052,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 690
+    },
+    {
+      "epoch": 0.48476454293628807,
+      "grad_norm": 0.5827652812004089,
+      "learning_rate": 0.0002,
+      "loss": 1.1407,
+      "step": 700
+    },
+    {
+      "epoch": 0.4916897506925208,
+      "grad_norm": 0.7706893682479858,
+      "learning_rate": 0.0002,
+      "loss": 1.1039,
+      "step": 710
+    },
+    {
+      "epoch": 0.4986149584487535,
+      "grad_norm": 0.572982132434845,
+      "learning_rate": 0.0002,
+      "loss": 1.1646,
+      "step": 720
+    },
+    {
+      "epoch": 0.5055401662049861,
+      "grad_norm": 0.6739320755004883,
+      "learning_rate": 0.0002,
+      "loss": 1.2215,
+      "step": 730
+    },
+    {
+      "epoch": 0.5124653739612188,
+      "grad_norm": 0.6777870059013367,
+      "learning_rate": 0.0002,
+      "loss": 1.2512,
+      "step": 740
+    },
+    {
+      "epoch": 0.5193905817174516,
+      "grad_norm": 0.6553574204444885,
+      "learning_rate": 0.0002,
+      "loss": 1.1466,
+      "step": 750
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 0.6108107566833496,
+      "learning_rate": 0.0002,
+      "loss": 1.1747,
+      "step": 760
+    },
+    {
+      "epoch": 0.5332409972299169,
+      "grad_norm": 0.7425803542137146,
+      "learning_rate": 0.0002,
+      "loss": 1.2656,
+      "step": 770
+    },
+    {
+      "epoch": 0.5401662049861495,
+      "grad_norm": 0.5833735466003418,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 780
+    },
+    {
+      "epoch": 0.5470914127423823,
+      "grad_norm": 0.7790789604187012,
+      "learning_rate": 0.0002,
+      "loss": 1.1986,
+      "step": 790
+    },
+    {
+      "epoch": 0.554016620498615,
+      "grad_norm": 0.63566654920578,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 800
+    },
+    {
+      "epoch": 0.5609418282548476,
+      "grad_norm": 0.848599374294281,
+      "learning_rate": 0.0002,
+      "loss": 1.2435,
+      "step": 810
+    },
+    {
+      "epoch": 0.5678670360110804,
+      "grad_norm": 0.5431618094444275,
+      "learning_rate": 0.0002,
+      "loss": 1.1765,
+      "step": 820
+    },
+    {
+      "epoch": 0.574792243767313,
+      "grad_norm": 0.9087665677070618,
+      "learning_rate": 0.0002,
+      "loss": 1.1557,
+      "step": 830
+    },
+    {
+      "epoch": 0.5817174515235457,
+      "grad_norm": 0.7769272327423096,
+      "learning_rate": 0.0002,
+      "loss": 1.1919,
+      "step": 840
+    },
+    {
+      "epoch": 0.5886426592797784,
+      "grad_norm": 0.576531171798706,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 850
+    },
+    {
+      "epoch": 0.5955678670360111,
+      "grad_norm": 0.6455100178718567,
+      "learning_rate": 0.0002,
+      "loss": 1.1222,
+      "step": 860
+    },
+    {
+      "epoch": 0.6024930747922438,
+      "grad_norm": 0.7983489632606506,
+      "learning_rate": 0.0002,
+      "loss": 1.1871,
+      "step": 870
+    },
+    {
+      "epoch": 0.6094182825484764,
+      "grad_norm": 0.5942565202713013,
+      "learning_rate": 0.0002,
+      "loss": 1.2014,
+      "step": 880
+    },
+    {
+      "epoch": 0.6163434903047091,
+      "grad_norm": 1.1543667316436768,
+      "learning_rate": 0.0002,
+      "loss": 1.1541,
+      "step": 890
+    },
+    {
+      "epoch": 0.6232686980609419,
+      "grad_norm": 0.9434137344360352,
+      "learning_rate": 0.0002,
+      "loss": 1.2133,
+      "step": 900
+    },
+    {
+      "epoch": 0.6301939058171745,
+      "grad_norm": 0.6639302372932434,
+      "learning_rate": 0.0002,
+      "loss": 1.206,
+      "step": 910
+    },
+    {
+      "epoch": 0.6371191135734072,
+      "grad_norm": 0.5908414721488953,
+      "learning_rate": 0.0002,
+      "loss": 1.298,
+      "step": 920
+    },
+    {
+      "epoch": 0.6440443213296398,
+      "grad_norm": 0.8290476202964783,
+      "learning_rate": 0.0002,
+      "loss": 1.217,
+      "step": 930
+    },
+    {
+      "epoch": 0.6509695290858726,
+      "grad_norm": 0.6745959520339966,
+      "learning_rate": 0.0002,
+      "loss": 1.1333,
+      "step": 940
+    },
+    {
+      "epoch": 0.6578947368421053,
+      "grad_norm": 0.6290730834007263,
+      "learning_rate": 0.0002,
+      "loss": 1.2452,
+      "step": 950
+    },
+    {
+      "epoch": 0.6648199445983379,
+      "grad_norm": 0.7656926512718201,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 960
+    },
+    {
+      "epoch": 0.6717451523545707,
+      "grad_norm": 0.5598535537719727,
+      "learning_rate": 0.0002,
+      "loss": 1.0718,
+      "step": 970
+    },
+    {
+      "epoch": 0.6786703601108033,
+      "grad_norm": 0.9047361612319946,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 980
+    },
+    {
+      "epoch": 0.685595567867036,
+      "grad_norm": 0.6446689367294312,
+      "learning_rate": 0.0002,
+      "loss": 1.2108,
+      "step": 990
+    },
+    {
+      "epoch": 0.6925207756232687,
+      "grad_norm": 0.6625745296478271,
+      "learning_rate": 0.0002,
+      "loss": 1.1963,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6994459833795014,
+      "grad_norm": 0.8768957853317261,
+      "learning_rate": 0.0002,
+      "loss": 1.1837,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7063711911357341,
+      "grad_norm": 0.8657397031784058,
+      "learning_rate": 0.0002,
+      "loss": 1.2188,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7132963988919667,
+      "grad_norm": 0.5494899749755859,
+      "learning_rate": 0.0002,
+      "loss": 1.181,
+      "step": 1030
+    },
+    {
+      "epoch": 0.7202216066481995,
+      "grad_norm": 0.6485587954521179,
+      "learning_rate": 0.0002,
+      "loss": 1.1987,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7271468144044322,
+      "grad_norm": 0.6802186965942383,
+      "learning_rate": 0.0002,
+      "loss": 1.1402,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7340720221606648,
+      "grad_norm": 1.1354074478149414,
+      "learning_rate": 0.0002,
+      "loss": 1.1351,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7409972299168975,
+      "grad_norm": 0.47662121057510376,
+      "learning_rate": 0.0002,
+      "loss": 1.1664,
+      "step": 1070
+    },
+    {
+      "epoch": 0.7479224376731302,
+      "grad_norm": 0.7661218643188477,
+      "learning_rate": 0.0002,
+      "loss": 1.2385,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7548476454293629,
+      "grad_norm": 0.5716169476509094,
+      "learning_rate": 0.0002,
+      "loss": 1.0834,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7617728531855956,
+      "grad_norm": 0.7137057185173035,
+      "learning_rate": 0.0002,
+      "loss": 1.1129,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7686980609418282,
+      "grad_norm": 0.6075615882873535,
+      "learning_rate": 0.0002,
+      "loss": 1.2585,
+      "step": 1110
+    },
+    {
+      "epoch": 0.775623268698061,
+      "grad_norm": 0.7440633773803711,
+      "learning_rate": 0.0002,
+      "loss": 1.2645,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7825484764542936,
+      "grad_norm": 0.6310842037200928,
+      "learning_rate": 0.0002,
+      "loss": 1.1927,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7894736842105263,
+      "grad_norm": 0.9321758151054382,
+      "learning_rate": 0.0002,
+      "loss": 1.2051,
+      "step": 1140
+    },
+    {
+      "epoch": 0.796398891966759,
+      "grad_norm": 0.5627843141555786,
+      "learning_rate": 0.0002,
+      "loss": 1.1997,
+      "step": 1150
+    },
+    {
+      "epoch": 0.8033240997229917,
+      "grad_norm": 0.6857180595397949,
+      "learning_rate": 0.0002,
+      "loss": 1.0698,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8102493074792244,
+      "grad_norm": 0.7500603795051575,
+      "learning_rate": 0.0002,
+      "loss": 1.1898,
+      "step": 1170
+    },
+    {
+      "epoch": 0.817174515235457,
+      "grad_norm": 0.6745953559875488,
+      "learning_rate": 0.0002,
+      "loss": 1.1778,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8240997229916898,
+      "grad_norm": 0.7521778345108032,
+      "learning_rate": 0.0002,
+      "loss": 1.1761,
+      "step": 1190
+    },
+    {
+      "epoch": 0.8310249307479224,
+      "grad_norm": 0.8561633825302124,
+      "learning_rate": 0.0002,
+      "loss": 1.2064,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8379501385041551,
+      "grad_norm": 0.5534922480583191,
+      "learning_rate": 0.0002,
+      "loss": 1.2648,
+      "step": 1210
+    },
+    {
+      "epoch": 0.8448753462603878,
+      "grad_norm": 0.7363062500953674,
+      "learning_rate": 0.0002,
+      "loss": 1.1578,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8518005540166205,
+      "grad_norm": 0.767382800579071,
+      "learning_rate": 0.0002,
+      "loss": 1.1936,
+      "step": 1230
+    },
+    {
+      "epoch": 0.8587257617728532,
+      "grad_norm": 0.574267566204071,
+      "learning_rate": 0.0002,
+      "loss": 1.1598,
+      "step": 1240
+    },
+    {
+      "epoch": 0.8656509695290858,
+      "grad_norm": 0.7507286667823792,
+      "learning_rate": 0.0002,
+      "loss": 1.2896,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8725761772853186,
+      "grad_norm": 0.6347624659538269,
+      "learning_rate": 0.0002,
+      "loss": 1.1711,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8795013850415513,
+      "grad_norm": 0.8539272546768188,
+      "learning_rate": 0.0002,
+      "loss": 1.1623,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8864265927977839,
+      "grad_norm": 0.6943685412406921,
+      "learning_rate": 0.0002,
+      "loss": 1.1031,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8933518005540166,
+      "grad_norm": 0.7794576287269592,
+      "learning_rate": 0.0002,
+      "loss": 1.2052,
+      "step": 1290
+    },
+    {
+      "epoch": 0.9002770083102493,
+      "grad_norm": 0.7914884686470032,
+      "learning_rate": 0.0002,
+      "loss": 1.1464,
+      "step": 1300
+    },
+    {
+      "epoch": 0.907202216066482,
+      "grad_norm": 0.7065926790237427,
+      "learning_rate": 0.0002,
+      "loss": 1.1307,
+      "step": 1310
+    },
+    {
+      "epoch": 0.9141274238227147,
+      "grad_norm": 0.6672294735908508,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9210526315789473,
+      "grad_norm": 0.6708444356918335,
+      "learning_rate": 0.0002,
+      "loss": 1.1147,
+      "step": 1330
+    },
+    {
+      "epoch": 0.9279778393351801,
+      "grad_norm": 1.664172887802124,
+      "learning_rate": 0.0002,
+      "loss": 1.2664,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9349030470914127,
+      "grad_norm": 0.7257016897201538,
+      "learning_rate": 0.0002,
+      "loss": 1.1772,
+      "step": 1350
+    },
+    {
+      "epoch": 0.9418282548476454,
+      "grad_norm": 0.7262710332870483,
+      "learning_rate": 0.0002,
+      "loss": 1.2025,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9487534626038782,
+      "grad_norm": 0.5953166484832764,
+      "learning_rate": 0.0002,
+      "loss": 1.0711,
+      "step": 1370
+    },
+    {
+      "epoch": 0.9556786703601108,
+      "grad_norm": 0.7025235295295715,
+      "learning_rate": 0.0002,
+      "loss": 1.2246,
+      "step": 1380
+    },
+    {
+      "epoch": 0.9626038781163435,
+      "grad_norm": 1.520357608795166,
+      "learning_rate": 0.0002,
+      "loss": 1.091,
+      "step": 1390
+    },
+    {
+      "epoch": 0.9695290858725761,
+      "grad_norm": 0.6089615821838379,
+      "learning_rate": 0.0002,
+      "loss": 1.1666,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9764542936288089,
+      "grad_norm": 0.6419881582260132,
+      "learning_rate": 0.0002,
+      "loss": 1.2216,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9833795013850416,
+      "grad_norm": 0.8522403836250305,
+      "learning_rate": 0.0002,
+      "loss": 1.1212,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9903047091412742,
+      "grad_norm": 0.9326029419898987,
+      "learning_rate": 0.0002,
+      "loss": 1.0722,
+      "step": 1430
+    },
+    {
+      "epoch": 0.997229916897507,
+      "grad_norm": 0.5441697239875793,
+      "learning_rate": 0.0002,
+      "loss": 1.177,
+      "step": 1440
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.1866850852966309,
+      "eval_runtime": 127.8342,
+      "eval_samples_per_second": 3.567,
+      "eval_steps_per_second": 0.446,
+      "step": 1444
+    },
+    {
+      "epoch": 1.0041551246537397,
+      "grad_norm": 0.7892783880233765,
+      "learning_rate": 0.0002,
+      "loss": 1.1176,
+      "step": 1450
+    },
+    {
+      "epoch": 1.0110803324099722,
+      "grad_norm": 0.7309587597846985,
+      "learning_rate": 0.0002,
+      "loss": 0.9706,
+      "step": 1460
+    },
+    {
+      "epoch": 1.018005540166205,
+      "grad_norm": 0.6690002679824829,
+      "learning_rate": 0.0002,
+      "loss": 1.0024,
+      "step": 1470
+    },
+    {
+      "epoch": 1.0249307479224377,
+      "grad_norm": 0.7056353092193604,
+      "learning_rate": 0.0002,
+      "loss": 1.0554,
+      "step": 1480
+    },
+    {
+      "epoch": 1.0318559556786704,
+      "grad_norm": 1.1764522790908813,
+      "learning_rate": 0.0002,
+      "loss": 0.9452,
+      "step": 1490
+    },
+    {
+      "epoch": 1.0387811634349031,
+      "grad_norm": 0.8229345679283142,
+      "learning_rate": 0.0002,
+      "loss": 1.0361,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0457063711911356,
+      "grad_norm": 0.8496960997581482,
+      "learning_rate": 0.0002,
+      "loss": 1.0186,
+      "step": 1510
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "grad_norm": 1.1805299520492554,
+      "learning_rate": 0.0002,
+      "loss": 0.9889,
+      "step": 1520
+    },
+    {
+      "epoch": 1.059556786703601,
+      "grad_norm": 0.7064462900161743,
+      "learning_rate": 0.0002,
+      "loss": 1.0824,
+      "step": 1530
+    },
+    {
+      "epoch": 1.0664819944598338,
+      "grad_norm": 0.61557936668396,
+      "learning_rate": 0.0002,
+      "loss": 1.0742,
+      "step": 1540
+    },
+    {
+      "epoch": 1.0734072022160666,
+      "grad_norm": 0.7568621635437012,
+      "learning_rate": 0.0002,
+      "loss": 1.0466,
+      "step": 1550
+    },
+    {
+      "epoch": 1.080332409972299,
+      "grad_norm": 0.7227526903152466,
+      "learning_rate": 0.0002,
+      "loss": 0.9892,
+      "step": 1560
+    },
+    {
+      "epoch": 1.0872576177285318,
+      "grad_norm": 0.7698923349380493,
+      "learning_rate": 0.0002,
+      "loss": 1.0327,
+      "step": 1570
+    },
+    {
+      "epoch": 1.0941828254847645,
+      "grad_norm": 0.9773432016372681,
+      "learning_rate": 0.0002,
+      "loss": 1.0508,
+      "step": 1580
+    },
+    {
+      "epoch": 1.1011080332409973,
+      "grad_norm": 0.6344825029373169,
+      "learning_rate": 0.0002,
+      "loss": 1.0394,
+      "step": 1590
+    },
+    {
+      "epoch": 1.10803324099723,
+      "grad_norm": 0.7506313920021057,
+      "learning_rate": 0.0002,
+      "loss": 1.0448,
+      "step": 1600
+    },
+    {
+      "epoch": 1.1149584487534625,
+      "grad_norm": 0.6188986301422119,
+      "learning_rate": 0.0002,
+      "loss": 1.0789,
+      "step": 1610
+    },
+    {
+      "epoch": 1.1218836565096952,
+      "grad_norm": 0.7686996459960938,
+      "learning_rate": 0.0002,
+      "loss": 1.1199,
+      "step": 1620
+    },
+    {
+      "epoch": 1.128808864265928,
+      "grad_norm": 0.6722452640533447,
+      "learning_rate": 0.0002,
+      "loss": 1.1007,
+      "step": 1630
+    },
+    {
+      "epoch": 1.1357340720221607,
+      "grad_norm": 0.7810907959938049,
+      "learning_rate": 0.0002,
+      "loss": 1.0181,
+      "step": 1640
+    },
+    {
+      "epoch": 1.1426592797783934,
+      "grad_norm": 0.737457811832428,
+      "learning_rate": 0.0002,
+      "loss": 1.0127,
+      "step": 1650
+    },
+    {
+      "epoch": 1.149584487534626,
+      "grad_norm": 0.6397888660430908,
+      "learning_rate": 0.0002,
+      "loss": 1.0938,
+      "step": 1660
+    },
+    {
+      "epoch": 1.1565096952908587,
+      "grad_norm": 1.399459958076477,
+      "learning_rate": 0.0002,
+      "loss": 1.1539,
+      "step": 1670
+    },
+    {
+      "epoch": 1.1634349030470914,
+      "grad_norm": 0.6583362817764282,
+      "learning_rate": 0.0002,
+      "loss": 1.0687,
+      "step": 1680
+    },
+    {
+      "epoch": 1.1703601108033241,
+      "grad_norm": 0.7830096483230591,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 1690
+    },
+    {
+      "epoch": 1.1772853185595569,
+      "grad_norm": 0.8694793581962585,
+      "learning_rate": 0.0002,
+      "loss": 1.0201,
+      "step": 1700
+    },
+    {
+      "epoch": 1.1842105263157894,
+      "grad_norm": 0.9356638193130493,
+      "learning_rate": 0.0002,
+      "loss": 0.9582,
+      "step": 1710
+    },
+    {
+      "epoch": 1.1911357340720221,
+      "grad_norm": 0.7463025450706482,
+      "learning_rate": 0.0002,
+      "loss": 1.1172,
+      "step": 1720
+    },
+    {
+      "epoch": 1.1980609418282548,
+      "grad_norm": 1.1365296840667725,
+      "learning_rate": 0.0002,
+      "loss": 1.136,
+      "step": 1730
+    },
+    {
+      "epoch": 1.2049861495844876,
+      "grad_norm": 0.7806211113929749,
+      "learning_rate": 0.0002,
+      "loss": 1.0514,
+      "step": 1740
+    },
+    {
+      "epoch": 1.2119113573407203,
+      "grad_norm": 0.7092649340629578,
+      "learning_rate": 0.0002,
+      "loss": 1.0937,
+      "step": 1750
+    },
+    {
+      "epoch": 1.2188365650969528,
+      "grad_norm": 0.7950040698051453,
+      "learning_rate": 0.0002,
+      "loss": 0.9812,
+      "step": 1760
+    },
+    {
+      "epoch": 1.2257617728531855,
+      "grad_norm": 0.7956721782684326,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 1770
+    },
+    {
+      "epoch": 1.2326869806094183,
+      "grad_norm": 1.2559596300125122,
+      "learning_rate": 0.0002,
+      "loss": 1.0803,
+      "step": 1780
+    },
+    {
+      "epoch": 1.239612188365651,
+      "grad_norm": 0.7149469256401062,
+      "learning_rate": 0.0002,
+      "loss": 0.9808,
+      "step": 1790
+    },
+    {
+      "epoch": 1.2465373961218837,
+      "grad_norm": 1.1050105094909668,
+      "learning_rate": 0.0002,
+      "loss": 1.092,
+      "step": 1800
+    },
+    {
+      "epoch": 1.2534626038781163,
+      "grad_norm": 0.654937744140625,
+      "learning_rate": 0.0002,
+      "loss": 1.0495,
+      "step": 1810
+    },
+    {
+      "epoch": 1.260387811634349,
+      "grad_norm": 0.8587106466293335,
+      "learning_rate": 0.0002,
+      "loss": 1.1174,
+      "step": 1820
+    },
+    {
+      "epoch": 1.2673130193905817,
+      "grad_norm": 0.7676810026168823,
+      "learning_rate": 0.0002,
+      "loss": 1.0731,
+      "step": 1830
+    },
+    {
+      "epoch": 1.2742382271468145,
+      "grad_norm": 1.0124865770339966,
+      "learning_rate": 0.0002,
+      "loss": 1.1974,
+      "step": 1840
+    },
+    {
+      "epoch": 1.2811634349030472,
+      "grad_norm": 1.2307246923446655,
+      "learning_rate": 0.0002,
+      "loss": 1.1141,
+      "step": 1850
+    },
+    {
+      "epoch": 1.2880886426592797,
+      "grad_norm": 8.2122163772583,
+      "learning_rate": 0.0002,
+      "loss": 1.0616,
+      "step": 1860
+    },
+    {
+      "epoch": 1.2950138504155124,
+      "grad_norm": 0.760845422744751,
+      "learning_rate": 0.0002,
+      "loss": 1.0325,
+      "step": 1870
+    },
+    {
+      "epoch": 1.3019390581717452,
+      "grad_norm": 1.0610498189926147,
+      "learning_rate": 0.0002,
+      "loss": 1.0308,
+      "step": 1880
+    },
+    {
+      "epoch": 1.3088642659279779,
+      "grad_norm": 0.6851376891136169,
+      "learning_rate": 0.0002,
+      "loss": 1.0089,
+      "step": 1890
+    },
+    {
+      "epoch": 1.3157894736842106,
+      "grad_norm": 0.6386473774909973,
+      "learning_rate": 0.0002,
+      "loss": 1.0095,
+      "step": 1900
+    },
+    {
+      "epoch": 1.3227146814404431,
+      "grad_norm": 0.7173354029655457,
+      "learning_rate": 0.0002,
+      "loss": 1.0361,
+      "step": 1910
+    },
+    {
+      "epoch": 1.3296398891966759,
+      "grad_norm": 0.7832159996032715,
+      "learning_rate": 0.0002,
+      "loss": 0.9943,
+      "step": 1920
+    },
+    {
+      "epoch": 1.3365650969529086,
+      "grad_norm": 0.9489508867263794,
+      "learning_rate": 0.0002,
+      "loss": 0.977,
+      "step": 1930
+    },
+    {
+      "epoch": 1.3434903047091413,
+      "grad_norm": 1.257300853729248,
+      "learning_rate": 0.0002,
+      "loss": 1.1107,
+      "step": 1940
+    },
+    {
+      "epoch": 1.350415512465374,
+      "grad_norm": 0.7611873745918274,
+      "learning_rate": 0.0002,
+      "loss": 1.0002,
+      "step": 1950
+    },
+    {
+      "epoch": 1.3573407202216066,
+      "grad_norm": 0.8892331719398499,
+      "learning_rate": 0.0002,
+      "loss": 1.0792,
+      "step": 1960
+    },
+    {
+      "epoch": 1.3642659279778393,
+      "grad_norm": 0.9791921377182007,
+      "learning_rate": 0.0002,
+      "loss": 0.99,
+      "step": 1970
+    },
+    {
+      "epoch": 1.371191135734072,
+      "grad_norm": 0.7446845173835754,
+      "learning_rate": 0.0002,
+      "loss": 1.0726,
+      "step": 1980
+    },
+    {
+      "epoch": 1.3781163434903048,
+      "grad_norm": 1.2640385627746582,
+      "learning_rate": 0.0002,
+      "loss": 1.1053,
+      "step": 1990
+    },
+    {
+      "epoch": 1.3850415512465375,
+      "grad_norm": 0.916458785533905,
+      "learning_rate": 0.0002,
+      "loss": 1.0204,
+      "step": 2000
+    },
+    {
+      "epoch": 1.39196675900277,
+      "grad_norm": 0.5745769143104553,
+      "learning_rate": 0.0002,
+      "loss": 1.0265,
+      "step": 2010
+    },
+    {
+      "epoch": 1.3988919667590027,
+      "grad_norm": 0.8850314021110535,
+      "learning_rate": 0.0002,
+      "loss": 1.0098,
+      "step": 2020
+    },
+    {
+      "epoch": 1.4058171745152355,
+      "grad_norm": 0.8868108987808228,
+      "learning_rate": 0.0002,
+      "loss": 1.0288,
+      "step": 2030
+    },
+    {
+      "epoch": 1.4127423822714682,
+      "grad_norm": 0.9108741283416748,
+      "learning_rate": 0.0002,
+      "loss": 1.0299,
+      "step": 2040
+    },
+    {
+      "epoch": 1.419667590027701,
+      "grad_norm": 0.6422314643859863,
+      "learning_rate": 0.0002,
+      "loss": 1.0936,
+      "step": 2050
+    },
+    {
+      "epoch": 1.4265927977839334,
+      "grad_norm": 0.7537696957588196,
+      "learning_rate": 0.0002,
+      "loss": 1.1082,
+      "step": 2060
+    },
+    {
+      "epoch": 1.4335180055401662,
+      "grad_norm": 0.8336232900619507,
+      "learning_rate": 0.0002,
+      "loss": 1.0232,
+      "step": 2070
+    },
+    {
+      "epoch": 1.440443213296399,
+      "grad_norm": 0.8482881188392639,
+      "learning_rate": 0.0002,
+      "loss": 1.032,
+      "step": 2080
+    },
+    {
+      "epoch": 1.4473684210526316,
+      "grad_norm": 0.8688116669654846,
+      "learning_rate": 0.0002,
+      "loss": 0.9956,
+      "step": 2090
+    },
+    {
+      "epoch": 1.4542936288088644,
+      "grad_norm": 1.072304368019104,
+      "learning_rate": 0.0002,
+      "loss": 0.9161,
+      "step": 2100
+    },
+    {
+      "epoch": 1.4612188365650969,
+      "grad_norm": 0.8581675887107849,
+      "learning_rate": 0.0002,
+      "loss": 0.9512,
+      "step": 2110
+    },
+    {
+      "epoch": 1.4681440443213296,
+      "grad_norm": 0.832931637763977,
+      "learning_rate": 0.0002,
+      "loss": 1.0727,
+      "step": 2120
+    },
+    {
+      "epoch": 1.4750692520775623,
+      "grad_norm": 1.1179111003875732,
+      "learning_rate": 0.0002,
+      "loss": 1.1012,
+      "step": 2130
+    },
+    {
+      "epoch": 1.481994459833795,
+      "grad_norm": 0.8789092898368835,
+      "learning_rate": 0.0002,
+      "loss": 1.1072,
+      "step": 2140
+    },
+    {
+      "epoch": 1.4889196675900278,
+      "grad_norm": 0.6333845853805542,
+      "learning_rate": 0.0002,
+      "loss": 1.0156,
+      "step": 2150
+    },
+    {
+      "epoch": 1.4958448753462603,
+      "grad_norm": 1.2314733266830444,
+      "learning_rate": 0.0002,
+      "loss": 1.0529,
+      "step": 2160
+    },
+    {
+      "epoch": 1.502770083102493,
+      "grad_norm": 1.0805437564849854,
+      "learning_rate": 0.0002,
+      "loss": 1.0035,
+      "step": 2170
+    },
+    {
+      "epoch": 1.5096952908587258,
+      "grad_norm": 1.8751327991485596,
+      "learning_rate": 0.0002,
+      "loss": 1.0761,
+      "step": 2180
+    },
+    {
+      "epoch": 1.5166204986149583,
+      "grad_norm": 0.9077108502388,
+      "learning_rate": 0.0002,
+      "loss": 1.0014,
+      "step": 2190
+    },
+    {
+      "epoch": 1.5235457063711912,
+      "grad_norm": 0.9384046792984009,
+      "learning_rate": 0.0002,
+      "loss": 0.9302,
+      "step": 2200
+    },
+    {
+      "epoch": 1.5304709141274238,
+      "grad_norm": 1.4643501043319702,
+      "learning_rate": 0.0002,
+      "loss": 0.9929,
+      "step": 2210
+    },
+    {
+      "epoch": 1.5373961218836565,
+      "grad_norm": 1.143983244895935,
+      "learning_rate": 0.0002,
+      "loss": 1.034,
+      "step": 2220
+    },
+    {
+      "epoch": 1.5443213296398892,
+      "grad_norm": 1.4147950410842896,
+      "learning_rate": 0.0002,
+      "loss": 1.0875,
+      "step": 2230
+    },
+    {
+      "epoch": 1.5512465373961217,
+      "grad_norm": 0.9932852983474731,
+      "learning_rate": 0.0002,
+      "loss": 1.0433,
+      "step": 2240
+    },
+    {
+      "epoch": 1.5581717451523547,
+      "grad_norm": 0.7871463894844055,
+      "learning_rate": 0.0002,
+      "loss": 1.0386,
+      "step": 2250
+    },
+    {
+      "epoch": 1.5650969529085872,
+      "grad_norm": 0.8919233083724976,
+      "learning_rate": 0.0002,
+      "loss": 1.0358,
+      "step": 2260
+    },
+    {
+      "epoch": 1.57202216066482,
+      "grad_norm": 1.2963417768478394,
+      "learning_rate": 0.0002,
+      "loss": 1.0415,
+      "step": 2270
+    },
+    {
+      "epoch": 1.5789473684210527,
+      "grad_norm": 0.8413827419281006,
+      "learning_rate": 0.0002,
+      "loss": 0.9164,
+      "step": 2280
+    },
+    {
+      "epoch": 1.5858725761772852,
+      "grad_norm": 0.8697502613067627,
+      "learning_rate": 0.0002,
+      "loss": 0.9761,
+      "step": 2290
+    },
+    {
+      "epoch": 1.5927977839335181,
+      "grad_norm": 1.077934741973877,
+      "learning_rate": 0.0002,
+      "loss": 1.0459,
+      "step": 2300
+    },
+    {
+      "epoch": 1.5997229916897506,
+      "grad_norm": 1.0339025259017944,
+      "learning_rate": 0.0002,
+      "loss": 1.0709,
+      "step": 2310
+    },
+    {
+      "epoch": 1.6066481994459834,
+      "grad_norm": 1.0506618022918701,
+      "learning_rate": 0.0002,
+      "loss": 0.9864,
+      "step": 2320
+    },
+    {
+      "epoch": 1.613573407202216,
+      "grad_norm": 1.0697245597839355,
+      "learning_rate": 0.0002,
+      "loss": 1.0396,
+      "step": 2330
+    },
+    {
+      "epoch": 1.6204986149584486,
+      "grad_norm": 0.8745642900466919,
+      "learning_rate": 0.0002,
+      "loss": 0.9847,
+      "step": 2340
+    },
+    {
+      "epoch": 1.6274238227146816,
+      "grad_norm": 0.8144440054893494,
+      "learning_rate": 0.0002,
+      "loss": 1.1415,
+      "step": 2350
+    },
+    {
+      "epoch": 1.634349030470914,
+      "grad_norm": 1.444962739944458,
+      "learning_rate": 0.0002,
+      "loss": 1.0429,
+      "step": 2360
+    },
+    {
+      "epoch": 1.6412742382271468,
+      "grad_norm": 0.9747722744941711,
+      "learning_rate": 0.0002,
+      "loss": 1.0781,
+      "step": 2370
+    },
+    {
+      "epoch": 1.6481994459833795,
+      "grad_norm": 1.0384336709976196,
+      "learning_rate": 0.0002,
+      "loss": 1.1296,
+      "step": 2380
+    },
+    {
+      "epoch": 1.655124653739612,
+      "grad_norm": 0.8437462449073792,
+      "learning_rate": 0.0002,
+      "loss": 1.001,
+      "step": 2390
+    },
+    {
+      "epoch": 1.662049861495845,
+      "grad_norm": 0.9484480023384094,
+      "learning_rate": 0.0002,
+      "loss": 1.0744,
+      "step": 2400
+    },
+    {
+      "epoch": 1.6689750692520775,
+      "grad_norm": 1.1232330799102783,
+      "learning_rate": 0.0002,
+      "loss": 1.0879,
+      "step": 2410
+    },
+    {
+      "epoch": 1.6759002770083102,
+      "grad_norm": 1.3238739967346191,
+      "learning_rate": 0.0002,
+      "loss": 1.0989,
+      "step": 2420
+    },
+    {
+      "epoch": 1.682825484764543,
+      "grad_norm": 0.9080462455749512,
+      "learning_rate": 0.0002,
+      "loss": 1.1112,
+      "step": 2430
+    },
+    {
+      "epoch": 1.6897506925207755,
+      "grad_norm": 0.7368793487548828,
+      "learning_rate": 0.0002,
+      "loss": 0.9613,
+      "step": 2440
+    },
+    {
+      "epoch": 1.6966759002770084,
+      "grad_norm": 0.8097803592681885,
+      "learning_rate": 0.0002,
+      "loss": 1.0054,
+      "step": 2450
+    },
+    {
+      "epoch": 1.703601108033241,
+      "grad_norm": 1.1740880012512207,
+      "learning_rate": 0.0002,
+      "loss": 0.9131,
+      "step": 2460
+    },
+    {
+      "epoch": 1.7105263157894737,
+      "grad_norm": 0.9115633368492126,
+      "learning_rate": 0.0002,
+      "loss": 1.0208,
+      "step": 2470
+    },
+    {
+      "epoch": 1.7174515235457064,
+      "grad_norm": 1.2005901336669922,
+      "learning_rate": 0.0002,
+      "loss": 1.0717,
+      "step": 2480
+    },
+    {
+      "epoch": 1.724376731301939,
+      "grad_norm": 0.7378991842269897,
+      "learning_rate": 0.0002,
+      "loss": 1.0219,
+      "step": 2490
+    },
+    {
+      "epoch": 1.7313019390581719,
+      "grad_norm": 1.2069252729415894,
+      "learning_rate": 0.0002,
+      "loss": 0.9961,
+      "step": 2500
+    },
+    {
+      "epoch": 1.7382271468144044,
+      "grad_norm": 0.9264360070228577,
+      "learning_rate": 0.0002,
+      "loss": 1.0031,
+      "step": 2510
+    },
+    {
+      "epoch": 1.745152354570637,
+      "grad_norm": 1.2837986946105957,
+      "learning_rate": 0.0002,
+      "loss": 1.0955,
+      "step": 2520
+    },
+    {
+      "epoch": 1.7520775623268698,
+      "grad_norm": 0.8551320433616638,
+      "learning_rate": 0.0002,
+      "loss": 1.1124,
+      "step": 2530
+    },
+    {
+      "epoch": 1.7590027700831024,
+      "grad_norm": 1.1680896282196045,
+      "learning_rate": 0.0002,
+      "loss": 0.9931,
+      "step": 2540
+    },
+    {
+      "epoch": 1.7659279778393353,
+      "grad_norm": 0.8064085841178894,
+      "learning_rate": 0.0002,
+      "loss": 1.0235,
+      "step": 2550
+    },
+    {
+      "epoch": 1.7728531855955678,
+      "grad_norm": 1.0634359121322632,
+      "learning_rate": 0.0002,
+      "loss": 1.0987,
+      "step": 2560
+    },
+    {
+      "epoch": 1.7797783933518005,
+      "grad_norm": 1.2135919332504272,
+      "learning_rate": 0.0002,
+      "loss": 1.0651,
+      "step": 2570
+    },
+    {
+      "epoch": 1.7867036011080333,
+      "grad_norm": 0.7888956665992737,
+      "learning_rate": 0.0002,
+      "loss": 0.9905,
+      "step": 2580
+    },
+    {
+      "epoch": 1.7936288088642658,
+      "grad_norm": 0.8857567310333252,
+      "learning_rate": 0.0002,
+      "loss": 1.0877,
+      "step": 2590
+    },
+    {
+      "epoch": 1.8005540166204987,
+      "grad_norm": 0.8200850486755371,
+      "learning_rate": 0.0002,
+      "loss": 1.0503,
+      "step": 2600
+    },
+    {
+      "epoch": 1.8074792243767313,
+      "grad_norm": 1.3037220239639282,
+      "learning_rate": 0.0002,
+      "loss": 1.0455,
+      "step": 2610
+    },
+    {
+      "epoch": 1.814404432132964,
+      "grad_norm": 0.9380358457565308,
+      "learning_rate": 0.0002,
+      "loss": 1.0917,
+      "step": 2620
+    },
+    {
+      "epoch": 1.8213296398891967,
+      "grad_norm": 0.7629815936088562,
+      "learning_rate": 0.0002,
+      "loss": 1.0748,
+      "step": 2630
+    },
+    {
+      "epoch": 1.8282548476454292,
+      "grad_norm": 0.9144530296325684,
+      "learning_rate": 0.0002,
+      "loss": 1.0523,
+      "step": 2640
+    },
+    {
+      "epoch": 1.8351800554016622,
+      "grad_norm": 0.7524030804634094,
+      "learning_rate": 0.0002,
+      "loss": 0.9818,
+      "step": 2650
+    },
+    {
+      "epoch": 1.8421052631578947,
+      "grad_norm": 0.9500479698181152,
+      "learning_rate": 0.0002,
+      "loss": 1.0204,
+      "step": 2660
+    },
+    {
+      "epoch": 1.8490304709141274,
+      "grad_norm": 0.8645035028457642,
+      "learning_rate": 0.0002,
+      "loss": 1.0856,
+      "step": 2670
+    },
+    {
+      "epoch": 1.8559556786703602,
+      "grad_norm": 1.092092514038086,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 2680
+    },
+    {
+      "epoch": 1.8628808864265927,
+      "grad_norm": 0.7123227119445801,
+      "learning_rate": 0.0002,
+      "loss": 0.9982,
+      "step": 2690
+    },
+    {
+      "epoch": 1.8698060941828256,
+      "grad_norm": 0.9327989220619202,
+      "learning_rate": 0.0002,
+      "loss": 1.0773,
+      "step": 2700
+    },
+    {
+      "epoch": 1.8767313019390581,
+      "grad_norm": 0.7310019731521606,
+      "learning_rate": 0.0002,
+      "loss": 1.0766,
+      "step": 2710
+    },
+    {
+      "epoch": 1.8836565096952909,
+      "grad_norm": 0.7867451310157776,
+      "learning_rate": 0.0002,
+      "loss": 1.0225,
+      "step": 2720
+    },
+    {
+      "epoch": 1.8905817174515236,
+      "grad_norm": 1.1195571422576904,
+      "learning_rate": 0.0002,
+      "loss": 1.07,
+      "step": 2730
+    },
+    {
+      "epoch": 1.897506925207756,
+      "grad_norm": 0.9449717998504639,
+      "learning_rate": 0.0002,
+      "loss": 0.9448,
+      "step": 2740
+    },
+    {
+      "epoch": 1.904432132963989,
+      "grad_norm": 0.729250431060791,
+      "learning_rate": 0.0002,
+      "loss": 1.0431,
+      "step": 2750
+    },
+    {
+      "epoch": 1.9113573407202216,
+      "grad_norm": 1.400395154953003,
+      "learning_rate": 0.0002,
+      "loss": 1.0117,
+      "step": 2760
+    },
+    {
+      "epoch": 1.9182825484764543,
+      "grad_norm": 0.9791452288627625,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 2770
+    },
+    {
+      "epoch": 1.925207756232687,
+      "grad_norm": 0.8694387674331665,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 2780
+    },
+    {
+      "epoch": 1.9321329639889195,
+      "grad_norm": 1.6347402334213257,
+      "learning_rate": 0.0002,
+      "loss": 0.9917,
+      "step": 2790
+    },
+    {
+      "epoch": 1.9390581717451525,
+      "grad_norm": 0.9378516674041748,
+      "learning_rate": 0.0002,
+      "loss": 1.0014,
+      "step": 2800
+    },
+    {
+      "epoch": 1.945983379501385,
+      "grad_norm": 0.8114185929298401,
+      "learning_rate": 0.0002,
+      "loss": 1.0186,
+      "step": 2810
+    },
+    {
+      "epoch": 1.9529085872576177,
+      "grad_norm": 0.9170882701873779,
+      "learning_rate": 0.0002,
+      "loss": 1.1371,
+      "step": 2820
+    },
+    {
+      "epoch": 1.9598337950138505,
+      "grad_norm": 0.9626749753952026,
+      "learning_rate": 0.0002,
+      "loss": 0.9889,
+      "step": 2830
+    },
+    {
+      "epoch": 1.966759002770083,
+      "grad_norm": 0.9708928465843201,
+      "learning_rate": 0.0002,
+      "loss": 0.9683,
+      "step": 2840
+    },
+    {
+      "epoch": 1.973684210526316,
+      "grad_norm": 0.9437230229377747,
+      "learning_rate": 0.0002,
+      "loss": 1.0545,
+      "step": 2850
+    },
+    {
+      "epoch": 1.9806094182825484,
+      "grad_norm": 1.203179121017456,
+      "learning_rate": 0.0002,
+      "loss": 1.0495,
+      "step": 2860
+    },
+    {
+      "epoch": 1.9875346260387812,
+      "grad_norm": 0.7990216016769409,
+      "learning_rate": 0.0002,
+      "loss": 1.0167,
+      "step": 2870
+    },
+    {
+      "epoch": 1.994459833795014,
+      "grad_norm": 1.8473628759384155,
+      "learning_rate": 0.0002,
+      "loss": 1.0261,
+      "step": 2880
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.163743495941162,
+      "eval_runtime": 126.5445,
+      "eval_samples_per_second": 3.603,
+      "eval_steps_per_second": 0.45,
+      "step": 2888
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 11552,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.267102107303936e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}