MilaWang committed on Mar 28, 2025

Commit

8f950b6

verified ·

1 Parent(s): 7176275

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +9 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/merges.txt +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/optimizer.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/rng_state.pth +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/scheduler.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/special_tokens_map.json +14 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/tokenizer.json +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/tokenizer_config.json +43 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/trainer_state.json +1246 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/training_args.bin +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/vocab.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/merges.txt +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/optimizer.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/rng_state.pth +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/scheduler.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/special_tokens_map.json +14 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/tokenizer.json +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/tokenizer_config.json +43 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/trainer_state.json +1849 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/training_args.bin +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/vocab.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/merges.txt +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/optimizer.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/rng_state.pth +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/scheduler.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/special_tokens_map.json +14 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/tokenizer.json +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/tokenizer_config.json +43 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/trainer_state.json +2459 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/training_args.bin +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/vocab.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4282/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4282/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4282/adapter_model.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -3197,3 +3197,12 @@ gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0
 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-735/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-882/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-735/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-882/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2569/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3426/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4282/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5139/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5995/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6848/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf8d56f2a45ff8b602f365ecc6f3bba09418fd4078d2257d100ab6b4976a0764
+size 80755416

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf8d56f2a45ff8b602f365ecc6f3bba09418fd4078d2257d100ab6b4976a0764
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdc5541a0499bcb3694d72537b6339289dcde888fd456d6ed9c428187ab6c4ae
+size 41136570

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07b35210e244e84d5701644c8cd51598f3b8ad05bd13fc963c4bdfe8c0337bdc
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc12d8ae78cf6a5b378c230e9724a16b53310f5389d7717a4b62321543729c19
+size 1064

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
+size 11418266

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

	@@ -0,0 +1,1246 @@

+{
+  "best_metric": 1.1868568658828735,
+  "best_model_checkpoint": "outputs-001/Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1713,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011675423234092236,
+      "grad_norm": 0.26495933532714844,
+      "learning_rate": 0.0002,
+      "loss": 1.5356,
+      "step": 10
+    },
+    {
+      "epoch": 0.023350846468184472,
+      "grad_norm": 0.26226115226745605,
+      "learning_rate": 0.0002,
+      "loss": 1.5736,
+      "step": 20
+    },
+    {
+      "epoch": 0.03502626970227671,
+      "grad_norm": 0.2216806709766388,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 30
+    },
+    {
+      "epoch": 0.046701692936368944,
+      "grad_norm": 0.2804628014564514,
+      "learning_rate": 0.0002,
+      "loss": 1.3899,
+      "step": 40
+    },
+    {
+      "epoch": 0.05837711617046118,
+      "grad_norm": 0.26673951745033264,
+      "learning_rate": 0.0002,
+      "loss": 1.2255,
+      "step": 50
+    },
+    {
+      "epoch": 0.07005253940455342,
+      "grad_norm": 0.22234757244586945,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 60
+    },
+    {
+      "epoch": 0.08172796263864565,
+      "grad_norm": 0.17038528621196747,
+      "learning_rate": 0.0002,
+      "loss": 0.9847,
+      "step": 70
+    },
+    {
+      "epoch": 0.09340338587273789,
+      "grad_norm": 0.22402487695217133,
+      "learning_rate": 0.0002,
+      "loss": 0.9697,
+      "step": 80
+    },
+    {
+      "epoch": 0.10507880910683012,
+      "grad_norm": 0.2240290343761444,
+      "learning_rate": 0.0002,
+      "loss": 1.1175,
+      "step": 90
+    },
+    {
+      "epoch": 0.11675423234092236,
+      "grad_norm": 0.2043554037809372,
+      "learning_rate": 0.0002,
+      "loss": 1.1355,
+      "step": 100
+    },
+    {
+      "epoch": 0.1284296555750146,
+      "grad_norm": 0.20888502895832062,
+      "learning_rate": 0.0002,
+      "loss": 0.9512,
+      "step": 110
+    },
+    {
+      "epoch": 0.14010507880910683,
+      "grad_norm": 0.47382819652557373,
+      "learning_rate": 0.0002,
+      "loss": 1.128,
+      "step": 120
+    },
+    {
+      "epoch": 0.15178050204319907,
+      "grad_norm": 0.184955894947052,
+      "learning_rate": 0.0002,
+      "loss": 1.0121,
+      "step": 130
+    },
+    {
+      "epoch": 0.1634559252772913,
+      "grad_norm": 0.22605721652507782,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 140
+    },
+    {
+      "epoch": 0.17513134851138354,
+      "grad_norm": 0.2902279496192932,
+      "learning_rate": 0.0002,
+      "loss": 1.0642,
+      "step": 150
+    },
+    {
+      "epoch": 0.18680677174547577,
+      "grad_norm": 0.21148967742919922,
+      "learning_rate": 0.0002,
+      "loss": 1.0829,
+      "step": 160
+    },
+    {
+      "epoch": 0.198482194979568,
+      "grad_norm": 0.2443981170654297,
+      "learning_rate": 0.0002,
+      "loss": 1.1092,
+      "step": 170
+    },
+    {
+      "epoch": 0.21015761821366025,
+      "grad_norm": 0.25699228048324585,
+      "learning_rate": 0.0002,
+      "loss": 1.0163,
+      "step": 180
+    },
+    {
+      "epoch": 0.22183304144775248,
+      "grad_norm": 0.2449636310338974,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 190
+    },
+    {
+      "epoch": 0.23350846468184472,
+      "grad_norm": 0.25968459248542786,
+      "learning_rate": 0.0002,
+      "loss": 1.1952,
+      "step": 200
+    },
+    {
+      "epoch": 0.24518388791593695,
+      "grad_norm": 0.17932388186454773,
+      "learning_rate": 0.0002,
+      "loss": 1.0165,
+      "step": 210
+    },
+    {
+      "epoch": 0.2568593111500292,
+      "grad_norm": 0.22084972262382507,
+      "learning_rate": 0.0002,
+      "loss": 1.1445,
+      "step": 220
+    },
+    {
+      "epoch": 0.2685347343841214,
+      "grad_norm": 0.23466071486473083,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 230
+    },
+    {
+      "epoch": 0.28021015761821366,
+      "grad_norm": 0.20127305388450623,
+      "learning_rate": 0.0002,
+      "loss": 1.0947,
+      "step": 240
+    },
+    {
+      "epoch": 0.29188558085230587,
+      "grad_norm": 0.22740179300308228,
+      "learning_rate": 0.0002,
+      "loss": 1.1821,
+      "step": 250
+    },
+    {
+      "epoch": 0.30356100408639813,
+      "grad_norm": 0.23858675360679626,
+      "learning_rate": 0.0002,
+      "loss": 0.9888,
+      "step": 260
+    },
+    {
+      "epoch": 0.31523642732049034,
+      "grad_norm": 0.18527966737747192,
+      "learning_rate": 0.0002,
+      "loss": 1.1404,
+      "step": 270
+    },
+    {
+      "epoch": 0.3269118505545826,
+      "grad_norm": 0.20215417444705963,
+      "learning_rate": 0.0002,
+      "loss": 1.1307,
+      "step": 280
+    },
+    {
+      "epoch": 0.3385872737886748,
+      "grad_norm": 0.17396175861358643,
+      "learning_rate": 0.0002,
+      "loss": 1.1752,
+      "step": 290
+    },
+    {
+      "epoch": 0.3502626970227671,
+      "grad_norm": 0.2083478718996048,
+      "learning_rate": 0.0002,
+      "loss": 1.222,
+      "step": 300
+    },
+    {
+      "epoch": 0.3619381202568593,
+      "grad_norm": 0.26084500551223755,
+      "learning_rate": 0.0002,
+      "loss": 0.9636,
+      "step": 310
+    },
+    {
+      "epoch": 0.37361354349095155,
+      "grad_norm": 0.2090655416250229,
+      "learning_rate": 0.0002,
+      "loss": 1.0461,
+      "step": 320
+    },
+    {
+      "epoch": 0.38528896672504376,
+      "grad_norm": 0.26721376180648804,
+      "learning_rate": 0.0002,
+      "loss": 1.0545,
+      "step": 330
+    },
+    {
+      "epoch": 0.396964389959136,
+      "grad_norm": 0.2001899778842926,
+      "learning_rate": 0.0002,
+      "loss": 0.812,
+      "step": 340
+    },
+    {
+      "epoch": 0.40863981319322823,
+      "grad_norm": 0.2354399561882019,
+      "learning_rate": 0.0002,
+      "loss": 1.0476,
+      "step": 350
+    },
+    {
+      "epoch": 0.4203152364273205,
+      "grad_norm": 0.22031325101852417,
+      "learning_rate": 0.0002,
+      "loss": 1.0466,
+      "step": 360
+    },
+    {
+      "epoch": 0.4319906596614127,
+      "grad_norm": 0.21608088910579681,
+      "learning_rate": 0.0002,
+      "loss": 1.1381,
+      "step": 370
+    },
+    {
+      "epoch": 0.44366608289550497,
+      "grad_norm": 0.2018078863620758,
+      "learning_rate": 0.0002,
+      "loss": 1.0378,
+      "step": 380
+    },
+    {
+      "epoch": 0.4553415061295972,
+      "grad_norm": 0.22110284864902496,
+      "learning_rate": 0.0002,
+      "loss": 0.892,
+      "step": 390
+    },
+    {
+      "epoch": 0.46701692936368944,
+      "grad_norm": 0.23103947937488556,
+      "learning_rate": 0.0002,
+      "loss": 0.966,
+      "step": 400
+    },
+    {
+      "epoch": 0.47869235259778165,
+      "grad_norm": 0.21037138998508453,
+      "learning_rate": 0.0002,
+      "loss": 1.0522,
+      "step": 410
+    },
+    {
+      "epoch": 0.4903677758318739,
+      "grad_norm": 0.18703506886959076,
+      "learning_rate": 0.0002,
+      "loss": 1.0784,
+      "step": 420
+    },
+    {
+      "epoch": 0.5020431990659662,
+      "grad_norm": 0.22972488403320312,
+      "learning_rate": 0.0002,
+      "loss": 1.1794,
+      "step": 430
+    },
+    {
+      "epoch": 0.5137186223000584,
+      "grad_norm": 0.17576873302459717,
+      "learning_rate": 0.0002,
+      "loss": 1.0857,
+      "step": 440
+    },
+    {
+      "epoch": 0.5253940455341506,
+      "grad_norm": 0.42553630471229553,
+      "learning_rate": 0.0002,
+      "loss": 1.2453,
+      "step": 450
+    },
+    {
+      "epoch": 0.5370694687682428,
+      "grad_norm": 0.2631092071533203,
+      "learning_rate": 0.0002,
+      "loss": 1.2011,
+      "step": 460
+    },
+    {
+      "epoch": 0.5487448920023351,
+      "grad_norm": 0.22879736125469208,
+      "learning_rate": 0.0002,
+      "loss": 1.2222,
+      "step": 470
+    },
+    {
+      "epoch": 0.5604203152364273,
+      "grad_norm": 0.1826648712158203,
+      "learning_rate": 0.0002,
+      "loss": 1.3207,
+      "step": 480
+    },
+    {
+      "epoch": 0.5720957384705195,
+      "grad_norm": 0.18885228037834167,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 490
+    },
+    {
+      "epoch": 0.5837711617046117,
+      "grad_norm": 0.17247331142425537,
+      "learning_rate": 0.0002,
+      "loss": 1.1076,
+      "step": 500
+    },
+    {
+      "epoch": 0.5954465849387041,
+      "grad_norm": 0.19905146956443787,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 510
+    },
+    {
+      "epoch": 0.6071220081727963,
+      "grad_norm": 0.21799565851688385,
+      "learning_rate": 0.0002,
+      "loss": 0.9839,
+      "step": 520
+    },
+    {
+      "epoch": 0.6187974314068885,
+      "grad_norm": 0.2032463699579239,
+      "learning_rate": 0.0002,
+      "loss": 1.0234,
+      "step": 530
+    },
+    {
+      "epoch": 0.6304728546409807,
+      "grad_norm": 0.14968429505825043,
+      "learning_rate": 0.0002,
+      "loss": 0.9503,
+      "step": 540
+    },
+    {
+      "epoch": 0.642148277875073,
+      "grad_norm": 0.17513799667358398,
+      "learning_rate": 0.0002,
+      "loss": 0.833,
+      "step": 550
+    },
+    {
+      "epoch": 0.6538237011091652,
+      "grad_norm": 0.1893497258424759,
+      "learning_rate": 0.0002,
+      "loss": 1.0586,
+      "step": 560
+    },
+    {
+      "epoch": 0.6654991243432574,
+      "grad_norm": 0.3045499324798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1426,
+      "step": 570
+    },
+    {
+      "epoch": 0.6771745475773496,
+      "grad_norm": 0.21172650158405304,
+      "learning_rate": 0.0002,
+      "loss": 0.9317,
+      "step": 580
+    },
+    {
+      "epoch": 0.688849970811442,
+      "grad_norm": 0.20392045378684998,
+      "learning_rate": 0.0002,
+      "loss": 1.1187,
+      "step": 590
+    },
+    {
+      "epoch": 0.7005253940455342,
+      "grad_norm": 0.17182187736034393,
+      "learning_rate": 0.0002,
+      "loss": 0.9187,
+      "step": 600
+    },
+    {
+      "epoch": 0.7122008172796264,
+      "grad_norm": 0.17221297323703766,
+      "learning_rate": 0.0002,
+      "loss": 0.9988,
+      "step": 610
+    },
+    {
+      "epoch": 0.7238762405137186,
+      "grad_norm": 0.18639299273490906,
+      "learning_rate": 0.0002,
+      "loss": 1.0334,
+      "step": 620
+    },
+    {
+      "epoch": 0.7355516637478109,
+      "grad_norm": 0.16991640627384186,
+      "learning_rate": 0.0002,
+      "loss": 1.0834,
+      "step": 630
+    },
+    {
+      "epoch": 0.7472270869819031,
+      "grad_norm": 0.23263484239578247,
+      "learning_rate": 0.0002,
+      "loss": 0.9335,
+      "step": 640
+    },
+    {
+      "epoch": 0.7589025102159953,
+      "grad_norm": 0.16419798135757446,
+      "learning_rate": 0.0002,
+      "loss": 0.9715,
+      "step": 650
+    },
+    {
+      "epoch": 0.7705779334500875,
+      "grad_norm": 0.20663365721702576,
+      "learning_rate": 0.0002,
+      "loss": 1.0119,
+      "step": 660
+    },
+    {
+      "epoch": 0.7822533566841798,
+      "grad_norm": 0.21871459484100342,
+      "learning_rate": 0.0002,
+      "loss": 1.143,
+      "step": 670
+    },
+    {
+      "epoch": 0.793928779918272,
+      "grad_norm": 0.20669031143188477,
+      "learning_rate": 0.0002,
+      "loss": 1.0363,
+      "step": 680
+    },
+    {
+      "epoch": 0.8056042031523643,
+      "grad_norm": 0.1783137321472168,
+      "learning_rate": 0.0002,
+      "loss": 1.0825,
+      "step": 690
+    },
+    {
+      "epoch": 0.8172796263864565,
+      "grad_norm": 0.24621079862117767,
+      "learning_rate": 0.0002,
+      "loss": 1.0002,
+      "step": 700
+    },
+    {
+      "epoch": 0.8289550496205488,
+      "grad_norm": 0.22598953545093536,
+      "learning_rate": 0.0002,
+      "loss": 1.1322,
+      "step": 710
+    },
+    {
+      "epoch": 0.840630472854641,
+      "grad_norm": 0.17925500869750977,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 720
+    },
+    {
+      "epoch": 0.8523058960887332,
+      "grad_norm": 0.25278252363204956,
+      "learning_rate": 0.0002,
+      "loss": 1.0691,
+      "step": 730
+    },
+    {
+      "epoch": 0.8639813193228254,
+      "grad_norm": 0.5249322652816772,
+      "learning_rate": 0.0002,
+      "loss": 1.0791,
+      "step": 740
+    },
+    {
+      "epoch": 0.8756567425569177,
+      "grad_norm": 0.29942265152931213,
+      "learning_rate": 0.0002,
+      "loss": 1.0798,
+      "step": 750
+    },
+    {
+      "epoch": 0.8873321657910099,
+      "grad_norm": 0.2682401239871979,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 760
+    },
+    {
+      "epoch": 0.8990075890251021,
+      "grad_norm": 0.28810951113700867,
+      "learning_rate": 0.0002,
+      "loss": 1.0917,
+      "step": 770
+    },
+    {
+      "epoch": 0.9106830122591943,
+      "grad_norm": 0.24986644089221954,
+      "learning_rate": 0.0002,
+      "loss": 1.0009,
+      "step": 780
+    },
+    {
+      "epoch": 0.9223584354932867,
+      "grad_norm": 0.21351364254951477,
+      "learning_rate": 0.0002,
+      "loss": 1.0751,
+      "step": 790
+    },
+    {
+      "epoch": 0.9340338587273789,
+      "grad_norm": 0.21321788430213928,
+      "learning_rate": 0.0002,
+      "loss": 1.2201,
+      "step": 800
+    },
+    {
+      "epoch": 0.9457092819614711,
+      "grad_norm": 0.39119839668273926,
+      "learning_rate": 0.0002,
+      "loss": 1.0977,
+      "step": 810
+    },
+    {
+      "epoch": 0.9573847051955633,
+      "grad_norm": 0.1995590776205063,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 820
+    },
+    {
+      "epoch": 0.9690601284296556,
+      "grad_norm": 0.1983078271150589,
+      "learning_rate": 0.0002,
+      "loss": 0.9257,
+      "step": 830
+    },
+    {
+      "epoch": 0.9807355516637478,
+      "grad_norm": 0.19562935829162598,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 840
+    },
+    {
+      "epoch": 0.99241097489784,
+      "grad_norm": 0.21720626950263977,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 850
+    },
+    {
+      "epoch": 0.9994162288382954,
+      "eval_loss": 1.1984628438949585,
+      "eval_runtime": 52.824,
+      "eval_samples_per_second": 8.67,
+      "eval_steps_per_second": 1.098,
+      "step": 856
+    },
+    {
+      "epoch": 1.0040863981319323,
+      "grad_norm": 0.20022626221179962,
+      "learning_rate": 0.0002,
+      "loss": 1.1012,
+      "step": 860
+    },
+    {
+      "epoch": 1.0157618213660244,
+      "grad_norm": 0.18347179889678955,
+      "learning_rate": 0.0002,
+      "loss": 1.1477,
+      "step": 870
+    },
+    {
+      "epoch": 1.0274372446001168,
+      "grad_norm": 0.27677398920059204,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 880
+    },
+    {
+      "epoch": 1.039112667834209,
+      "grad_norm": 0.1613788902759552,
+      "learning_rate": 0.0002,
+      "loss": 0.9135,
+      "step": 890
+    },
+    {
+      "epoch": 1.0507880910683012,
+      "grad_norm": 0.34981176257133484,
+      "learning_rate": 0.0002,
+      "loss": 0.9362,
+      "step": 900
+    },
+    {
+      "epoch": 1.0624635143023935,
+      "grad_norm": 0.2047315239906311,
+      "learning_rate": 0.0002,
+      "loss": 1.0158,
+      "step": 910
+    },
+    {
+      "epoch": 1.0741389375364856,
+      "grad_norm": 0.2312125563621521,
+      "learning_rate": 0.0002,
+      "loss": 1.0819,
+      "step": 920
+    },
+    {
+      "epoch": 1.085814360770578,
+      "grad_norm": 0.1890091598033905,
+      "learning_rate": 0.0002,
+      "loss": 0.8474,
+      "step": 930
+    },
+    {
+      "epoch": 1.0974897840046702,
+      "grad_norm": 0.2594001889228821,
+      "learning_rate": 0.0002,
+      "loss": 0.9807,
+      "step": 940
+    },
+    {
+      "epoch": 1.1091652072387623,
+      "grad_norm": 0.23180805146694183,
+      "learning_rate": 0.0002,
+      "loss": 0.9598,
+      "step": 950
+    },
+    {
+      "epoch": 1.1208406304728546,
+      "grad_norm": 0.3079565465450287,
+      "learning_rate": 0.0002,
+      "loss": 0.9935,
+      "step": 960
+    },
+    {
+      "epoch": 1.132516053706947,
+      "grad_norm": 0.348038911819458,
+      "learning_rate": 0.0002,
+      "loss": 1.1019,
+      "step": 970
+    },
+    {
+      "epoch": 1.144191476941039,
+      "grad_norm": 0.25485727190971375,
+      "learning_rate": 0.0002,
+      "loss": 1.0712,
+      "step": 980
+    },
+    {
+      "epoch": 1.1558669001751314,
+      "grad_norm": 0.3280978202819824,
+      "learning_rate": 0.0002,
+      "loss": 0.9455,
+      "step": 990
+    },
+    {
+      "epoch": 1.1675423234092235,
+      "grad_norm": 0.3325645327568054,
+      "learning_rate": 0.0002,
+      "loss": 0.8836,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1792177466433158,
+      "grad_norm": 0.25743699073791504,
+      "learning_rate": 0.0002,
+      "loss": 0.8908,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1908931698774081,
+      "grad_norm": 0.23885756731033325,
+      "learning_rate": 0.0002,
+      "loss": 0.9363,
+      "step": 1020
+    },
+    {
+      "epoch": 1.2025685931115002,
+      "grad_norm": 0.2594054043292999,
+      "learning_rate": 0.0002,
+      "loss": 1.0811,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2142440163455925,
+      "grad_norm": 0.2806910276412964,
+      "learning_rate": 0.0002,
+      "loss": 0.8865,
+      "step": 1040
+    },
+    {
+      "epoch": 1.2259194395796849,
+      "grad_norm": 0.2919756770133972,
+      "learning_rate": 0.0002,
+      "loss": 0.9874,
+      "step": 1050
+    },
+    {
+      "epoch": 1.237594862813777,
+      "grad_norm": 0.2846801280975342,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 1060
+    },
+    {
+      "epoch": 1.2492702860478693,
+      "grad_norm": 0.22056721150875092,
+      "learning_rate": 0.0002,
+      "loss": 1.0518,
+      "step": 1070
+    },
+    {
+      "epoch": 1.2609457092819616,
+      "grad_norm": 0.21786770224571228,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 1080
+    },
+    {
+      "epoch": 1.2726211325160537,
+      "grad_norm": 0.21728235483169556,
+      "learning_rate": 0.0002,
+      "loss": 1.1127,
+      "step": 1090
+    },
+    {
+      "epoch": 1.284296555750146,
+      "grad_norm": 0.38934388756752014,
+      "learning_rate": 0.0002,
+      "loss": 0.9172,
+      "step": 1100
+    },
+    {
+      "epoch": 1.295971978984238,
+      "grad_norm": 0.4942418336868286,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3076474022183304,
+      "grad_norm": 0.22632132470607758,
+      "learning_rate": 0.0002,
+      "loss": 0.979,
+      "step": 1120
+    },
+    {
+      "epoch": 1.3193228254524225,
+      "grad_norm": 0.2033795416355133,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1130
+    },
+    {
+      "epoch": 1.3309982486865148,
+      "grad_norm": 0.3090406656265259,
+      "learning_rate": 0.0002,
+      "loss": 1.022,
+      "step": 1140
+    },
+    {
+      "epoch": 1.3426736719206072,
+      "grad_norm": 0.3908712565898895,
+      "learning_rate": 0.0002,
+      "loss": 0.9919,
+      "step": 1150
+    },
+    {
+      "epoch": 1.3543490951546993,
+      "grad_norm": 0.5885194540023804,
+      "learning_rate": 0.0002,
+      "loss": 0.9933,
+      "step": 1160
+    },
+    {
+      "epoch": 1.3660245183887916,
+      "grad_norm": 0.28344446420669556,
+      "learning_rate": 0.0002,
+      "loss": 0.9689,
+      "step": 1170
+    },
+    {
+      "epoch": 1.377699941622884,
+      "grad_norm": 0.305290162563324,
+      "learning_rate": 0.0002,
+      "loss": 0.8012,
+      "step": 1180
+    },
+    {
+      "epoch": 1.389375364856976,
+      "grad_norm": 0.47231870889663696,
+      "learning_rate": 0.0002,
+      "loss": 1.0218,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4010507880910683,
+      "grad_norm": 0.1865382194519043,
+      "learning_rate": 0.0002,
+      "loss": 0.7837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4127262113251606,
+      "grad_norm": 0.19491282105445862,
+      "learning_rate": 0.0002,
+      "loss": 0.8884,
+      "step": 1210
+    },
+    {
+      "epoch": 1.4244016345592527,
+      "grad_norm": 0.26192259788513184,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1220
+    },
+    {
+      "epoch": 1.436077057793345,
+      "grad_norm": 0.25829964876174927,
+      "learning_rate": 0.0002,
+      "loss": 0.959,
+      "step": 1230
+    },
+    {
+      "epoch": 1.4477524810274374,
+      "grad_norm": 0.24313300848007202,
+      "learning_rate": 0.0002,
+      "loss": 0.9476,
+      "step": 1240
+    },
+    {
+      "epoch": 1.4594279042615295,
+      "grad_norm": 0.18807671964168549,
+      "learning_rate": 0.0002,
+      "loss": 0.9669,
+      "step": 1250
+    },
+    {
+      "epoch": 1.4711033274956218,
+      "grad_norm": 0.24352246522903442,
+      "learning_rate": 0.0002,
+      "loss": 0.9117,
+      "step": 1260
+    },
+    {
+      "epoch": 1.4827787507297139,
+      "grad_norm": 0.401624470949173,
+      "learning_rate": 0.0002,
+      "loss": 0.9541,
+      "step": 1270
+    },
+    {
+      "epoch": 1.4944541739638062,
+      "grad_norm": 0.3230941891670227,
+      "learning_rate": 0.0002,
+      "loss": 0.8756,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5061295971978983,
+      "grad_norm": 0.22052809596061707,
+      "learning_rate": 0.0002,
+      "loss": 0.8811,
+      "step": 1290
+    },
+    {
+      "epoch": 1.5178050204319906,
+      "grad_norm": 0.8212894201278687,
+      "learning_rate": 0.0002,
+      "loss": 1.0614,
+      "step": 1300
+    },
+    {
+      "epoch": 1.529480443666083,
+      "grad_norm": 0.2073482722043991,
+      "learning_rate": 0.0002,
+      "loss": 0.9205,
+      "step": 1310
+    },
+    {
+      "epoch": 1.541155866900175,
+      "grad_norm": 0.2663249373435974,
+      "learning_rate": 0.0002,
+      "loss": 0.9092,
+      "step": 1320
+    },
+    {
+      "epoch": 1.5528312901342674,
+      "grad_norm": 0.20269067585468292,
+      "learning_rate": 0.0002,
+      "loss": 0.9093,
+      "step": 1330
+    },
+    {
+      "epoch": 1.5645067133683597,
+      "grad_norm": 0.23635980486869812,
+      "learning_rate": 0.0002,
+      "loss": 0.9924,
+      "step": 1340
+    },
+    {
+      "epoch": 1.5761821366024518,
+      "grad_norm": 0.2865060865879059,
+      "learning_rate": 0.0002,
+      "loss": 0.9669,
+      "step": 1350
+    },
+    {
+      "epoch": 1.587857559836544,
+      "grad_norm": 0.19927282631397247,
+      "learning_rate": 0.0002,
+      "loss": 0.982,
+      "step": 1360
+    },
+    {
+      "epoch": 1.5995329830706364,
+      "grad_norm": 0.2837635278701782,
+      "learning_rate": 0.0002,
+      "loss": 0.9702,
+      "step": 1370
+    },
+    {
+      "epoch": 1.6112084063047285,
+      "grad_norm": 0.20541565120220184,
+      "learning_rate": 0.0002,
+      "loss": 0.8996,
+      "step": 1380
+    },
+    {
+      "epoch": 1.6228838295388208,
+      "grad_norm": 0.29976120591163635,
+      "learning_rate": 0.0002,
+      "loss": 0.9704,
+      "step": 1390
+    },
+    {
+      "epoch": 1.6345592527729131,
+      "grad_norm": 0.4811157286167145,
+      "learning_rate": 0.0002,
+      "loss": 1.0675,
+      "step": 1400
+    },
+    {
+      "epoch": 1.6462346760070052,
+      "grad_norm": 0.45696231722831726,
+      "learning_rate": 0.0002,
+      "loss": 0.9802,
+      "step": 1410
+    },
+    {
+      "epoch": 1.6579100992410973,
+      "grad_norm": 0.26527705788612366,
+      "learning_rate": 0.0002,
+      "loss": 0.993,
+      "step": 1420
+    },
+    {
+      "epoch": 1.6695855224751899,
+      "grad_norm": 0.2607211172580719,
+      "learning_rate": 0.0002,
+      "loss": 0.8715,
+      "step": 1430
+    },
+    {
+      "epoch": 1.681260945709282,
+      "grad_norm": 0.30872678756713867,
+      "learning_rate": 0.0002,
+      "loss": 0.9527,
+      "step": 1440
+    },
+    {
+      "epoch": 1.692936368943374,
+      "grad_norm": 0.378160297870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1103,
+      "step": 1450
+    },
+    {
+      "epoch": 1.7046117921774664,
+      "grad_norm": 0.4253346025943756,
+      "learning_rate": 0.0002,
+      "loss": 0.9507,
+      "step": 1460
+    },
+    {
+      "epoch": 1.7162872154115587,
+      "grad_norm": 0.23859360814094543,
+      "learning_rate": 0.0002,
+      "loss": 1.0566,
+      "step": 1470
+    },
+    {
+      "epoch": 1.7279626386456508,
+      "grad_norm": 0.4765002727508545,
+      "learning_rate": 0.0002,
+      "loss": 0.93,
+      "step": 1480
+    },
+    {
+      "epoch": 1.7396380618797431,
+      "grad_norm": 0.1958352029323578,
+      "learning_rate": 0.0002,
+      "loss": 0.9884,
+      "step": 1490
+    },
+    {
+      "epoch": 1.7513134851138354,
+      "grad_norm": 0.1772938221693039,
+      "learning_rate": 0.0002,
+      "loss": 1.0072,
+      "step": 1500
+    },
+    {
+      "epoch": 1.7629889083479275,
+      "grad_norm": 0.2589171826839447,
+      "learning_rate": 0.0002,
+      "loss": 0.9798,
+      "step": 1510
+    },
+    {
+      "epoch": 1.7746643315820199,
+      "grad_norm": 0.638349175453186,
+      "learning_rate": 0.0002,
+      "loss": 0.9079,
+      "step": 1520
+    },
+    {
+      "epoch": 1.7863397548161122,
+      "grad_norm": 0.2402309626340866,
+      "learning_rate": 0.0002,
+      "loss": 0.8141,
+      "step": 1530
+    },
+    {
+      "epoch": 1.7980151780502043,
+      "grad_norm": 0.3758494257926941,
+      "learning_rate": 0.0002,
+      "loss": 0.9535,
+      "step": 1540
+    },
+    {
+      "epoch": 1.8096906012842966,
+      "grad_norm": 0.26750659942626953,
+      "learning_rate": 0.0002,
+      "loss": 0.9862,
+      "step": 1550
+    },
+    {
+      "epoch": 1.821366024518389,
+      "grad_norm": 0.3884737193584442,
+      "learning_rate": 0.0002,
+      "loss": 0.9289,
+      "step": 1560
+    },
+    {
+      "epoch": 1.833041447752481,
+      "grad_norm": 0.2704276740550995,
+      "learning_rate": 0.0002,
+      "loss": 0.9064,
+      "step": 1570
+    },
+    {
+      "epoch": 1.844716870986573,
+      "grad_norm": 0.2269623726606369,
+      "learning_rate": 0.0002,
+      "loss": 1.0027,
+      "step": 1580
+    },
+    {
+      "epoch": 1.8563922942206657,
+      "grad_norm": 0.23369084298610687,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1590
+    },
+    {
+      "epoch": 1.8680677174547577,
+      "grad_norm": 0.34336966276168823,
+      "learning_rate": 0.0002,
+      "loss": 0.8899,
+      "step": 1600
+    },
+    {
+      "epoch": 1.8797431406888498,
+      "grad_norm": 0.638863205909729,
+      "learning_rate": 0.0002,
+      "loss": 0.9465,
+      "step": 1610
+    },
+    {
+      "epoch": 1.8914185639229422,
+      "grad_norm": 0.4810437262058258,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 1620
+    },
+    {
+      "epoch": 1.9030939871570345,
+      "grad_norm": 0.27600526809692383,
+      "learning_rate": 0.0002,
+      "loss": 0.8272,
+      "step": 1630
+    },
+    {
+      "epoch": 1.9147694103911266,
+      "grad_norm": 0.44480231404304504,
+      "learning_rate": 0.0002,
+      "loss": 0.8816,
+      "step": 1640
+    },
+    {
+      "epoch": 1.926444833625219,
+      "grad_norm": 0.29854336380958557,
+      "learning_rate": 0.0002,
+      "loss": 0.8868,
+      "step": 1650
+    },
+    {
+      "epoch": 1.9381202568593112,
+      "grad_norm": 0.21352696418762207,
+      "learning_rate": 0.0002,
+      "loss": 0.9721,
+      "step": 1660
+    },
+    {
+      "epoch": 1.9497956800934033,
+      "grad_norm": 0.26450464129447937,
+      "learning_rate": 0.0002,
+      "loss": 0.9225,
+      "step": 1670
+    },
+    {
+      "epoch": 1.9614711033274956,
+      "grad_norm": 0.23895719647407532,
+      "learning_rate": 0.0002,
+      "loss": 1.0356,
+      "step": 1680
+    },
+    {
+      "epoch": 1.973146526561588,
+      "grad_norm": 0.23323677480220795,
+      "learning_rate": 0.0002,
+      "loss": 0.9148,
+      "step": 1690
+    },
+    {
+      "epoch": 1.98482194979568,
+      "grad_norm": 0.46997103095054626,
+      "learning_rate": 0.0002,
+      "loss": 0.9857,
+      "step": 1700
+    },
+    {
+      "epoch": 1.9964973730297724,
+      "grad_norm": 0.34337419271469116,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 1710
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1868568658828735,
+      "eval_runtime": 53.25,
+      "eval_samples_per_second": 8.601,
+      "eval_steps_per_second": 1.089,
+      "step": 1713
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6848,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.462838867381453e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb077d5937aba7d77271acaac7bb330e71c022289d873a61f3df9490020d208d
+size 5688

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84d4d0a533794083b33c3a4bc878255f4483633152a8a6e565615344ccf0ea92
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58c2d98f52152853a1bdfd44cdf25c9aa1bc2d4935e8e27ebec37b3faa6108a3
+size 41136570

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ade1c7486804700c9239db6e4cde2b6632f3bfc27e5f4bddec0cc1dfc7f7251
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2855d136fbe83e85e3c5d15febf13f96c32756a6703ba8c2d375fce43b77e6e6
+size 1064

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
+size 11418266

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

	@@ -0,0 +1,1849 @@

+{
+  "best_metric": 1.1868568658828735,
+  "best_model_checkpoint": "outputs-001/Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713",
+  "epoch": 2.9994162288382955,
+  "eval_steps": 10,
+  "global_step": 2569,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011675423234092236,
+      "grad_norm": 0.26495933532714844,
+      "learning_rate": 0.0002,
+      "loss": 1.5356,
+      "step": 10
+    },
+    {
+      "epoch": 0.023350846468184472,
+      "grad_norm": 0.26226115226745605,
+      "learning_rate": 0.0002,
+      "loss": 1.5736,
+      "step": 20
+    },
+    {
+      "epoch": 0.03502626970227671,
+      "grad_norm": 0.2216806709766388,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 30
+    },
+    {
+      "epoch": 0.046701692936368944,
+      "grad_norm": 0.2804628014564514,
+      "learning_rate": 0.0002,
+      "loss": 1.3899,
+      "step": 40
+    },
+    {
+      "epoch": 0.05837711617046118,
+      "grad_norm": 0.26673951745033264,
+      "learning_rate": 0.0002,
+      "loss": 1.2255,
+      "step": 50
+    },
+    {
+      "epoch": 0.07005253940455342,
+      "grad_norm": 0.22234757244586945,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 60
+    },
+    {
+      "epoch": 0.08172796263864565,
+      "grad_norm": 0.17038528621196747,
+      "learning_rate": 0.0002,
+      "loss": 0.9847,
+      "step": 70
+    },
+    {
+      "epoch": 0.09340338587273789,
+      "grad_norm": 0.22402487695217133,
+      "learning_rate": 0.0002,
+      "loss": 0.9697,
+      "step": 80
+    },
+    {
+      "epoch": 0.10507880910683012,
+      "grad_norm": 0.2240290343761444,
+      "learning_rate": 0.0002,
+      "loss": 1.1175,
+      "step": 90
+    },
+    {
+      "epoch": 0.11675423234092236,
+      "grad_norm": 0.2043554037809372,
+      "learning_rate": 0.0002,
+      "loss": 1.1355,
+      "step": 100
+    },
+    {
+      "epoch": 0.1284296555750146,
+      "grad_norm": 0.20888502895832062,
+      "learning_rate": 0.0002,
+      "loss": 0.9512,
+      "step": 110
+    },
+    {
+      "epoch": 0.14010507880910683,
+      "grad_norm": 0.47382819652557373,
+      "learning_rate": 0.0002,
+      "loss": 1.128,
+      "step": 120
+    },
+    {
+      "epoch": 0.15178050204319907,
+      "grad_norm": 0.184955894947052,
+      "learning_rate": 0.0002,
+      "loss": 1.0121,
+      "step": 130
+    },
+    {
+      "epoch": 0.1634559252772913,
+      "grad_norm": 0.22605721652507782,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 140
+    },
+    {
+      "epoch": 0.17513134851138354,
+      "grad_norm": 0.2902279496192932,
+      "learning_rate": 0.0002,
+      "loss": 1.0642,
+      "step": 150
+    },
+    {
+      "epoch": 0.18680677174547577,
+      "grad_norm": 0.21148967742919922,
+      "learning_rate": 0.0002,
+      "loss": 1.0829,
+      "step": 160
+    },
+    {
+      "epoch": 0.198482194979568,
+      "grad_norm": 0.2443981170654297,
+      "learning_rate": 0.0002,
+      "loss": 1.1092,
+      "step": 170
+    },
+    {
+      "epoch": 0.21015761821366025,
+      "grad_norm": 0.25699228048324585,
+      "learning_rate": 0.0002,
+      "loss": 1.0163,
+      "step": 180
+    },
+    {
+      "epoch": 0.22183304144775248,
+      "grad_norm": 0.2449636310338974,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 190
+    },
+    {
+      "epoch": 0.23350846468184472,
+      "grad_norm": 0.25968459248542786,
+      "learning_rate": 0.0002,
+      "loss": 1.1952,
+      "step": 200
+    },
+    {
+      "epoch": 0.24518388791593695,
+      "grad_norm": 0.17932388186454773,
+      "learning_rate": 0.0002,
+      "loss": 1.0165,
+      "step": 210
+    },
+    {
+      "epoch": 0.2568593111500292,
+      "grad_norm": 0.22084972262382507,
+      "learning_rate": 0.0002,
+      "loss": 1.1445,
+      "step": 220
+    },
+    {
+      "epoch": 0.2685347343841214,
+      "grad_norm": 0.23466071486473083,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 230
+    },
+    {
+      "epoch": 0.28021015761821366,
+      "grad_norm": 0.20127305388450623,
+      "learning_rate": 0.0002,
+      "loss": 1.0947,
+      "step": 240
+    },
+    {
+      "epoch": 0.29188558085230587,
+      "grad_norm": 0.22740179300308228,
+      "learning_rate": 0.0002,
+      "loss": 1.1821,
+      "step": 250
+    },
+    {
+      "epoch": 0.30356100408639813,
+      "grad_norm": 0.23858675360679626,
+      "learning_rate": 0.0002,
+      "loss": 0.9888,
+      "step": 260
+    },
+    {
+      "epoch": 0.31523642732049034,
+      "grad_norm": 0.18527966737747192,
+      "learning_rate": 0.0002,
+      "loss": 1.1404,
+      "step": 270
+    },
+    {
+      "epoch": 0.3269118505545826,
+      "grad_norm": 0.20215417444705963,
+      "learning_rate": 0.0002,
+      "loss": 1.1307,
+      "step": 280
+    },
+    {
+      "epoch": 0.3385872737886748,
+      "grad_norm": 0.17396175861358643,
+      "learning_rate": 0.0002,
+      "loss": 1.1752,
+      "step": 290
+    },
+    {
+      "epoch": 0.3502626970227671,
+      "grad_norm": 0.2083478718996048,
+      "learning_rate": 0.0002,
+      "loss": 1.222,
+      "step": 300
+    },
+    {
+      "epoch": 0.3619381202568593,
+      "grad_norm": 0.26084500551223755,
+      "learning_rate": 0.0002,
+      "loss": 0.9636,
+      "step": 310
+    },
+    {
+      "epoch": 0.37361354349095155,
+      "grad_norm": 0.2090655416250229,
+      "learning_rate": 0.0002,
+      "loss": 1.0461,
+      "step": 320
+    },
+    {
+      "epoch": 0.38528896672504376,
+      "grad_norm": 0.26721376180648804,
+      "learning_rate": 0.0002,
+      "loss": 1.0545,
+      "step": 330
+    },
+    {
+      "epoch": 0.396964389959136,
+      "grad_norm": 0.2001899778842926,
+      "learning_rate": 0.0002,
+      "loss": 0.812,
+      "step": 340
+    },
+    {
+      "epoch": 0.40863981319322823,
+      "grad_norm": 0.2354399561882019,
+      "learning_rate": 0.0002,
+      "loss": 1.0476,
+      "step": 350
+    },
+    {
+      "epoch": 0.4203152364273205,
+      "grad_norm": 0.22031325101852417,
+      "learning_rate": 0.0002,
+      "loss": 1.0466,
+      "step": 360
+    },
+    {
+      "epoch": 0.4319906596614127,
+      "grad_norm": 0.21608088910579681,
+      "learning_rate": 0.0002,
+      "loss": 1.1381,
+      "step": 370
+    },
+    {
+      "epoch": 0.44366608289550497,
+      "grad_norm": 0.2018078863620758,
+      "learning_rate": 0.0002,
+      "loss": 1.0378,
+      "step": 380
+    },
+    {
+      "epoch": 0.4553415061295972,
+      "grad_norm": 0.22110284864902496,
+      "learning_rate": 0.0002,
+      "loss": 0.892,
+      "step": 390
+    },
+    {
+      "epoch": 0.46701692936368944,
+      "grad_norm": 0.23103947937488556,
+      "learning_rate": 0.0002,
+      "loss": 0.966,
+      "step": 400
+    },
+    {
+      "epoch": 0.47869235259778165,
+      "grad_norm": 0.21037138998508453,
+      "learning_rate": 0.0002,
+      "loss": 1.0522,
+      "step": 410
+    },
+    {
+      "epoch": 0.4903677758318739,
+      "grad_norm": 0.18703506886959076,
+      "learning_rate": 0.0002,
+      "loss": 1.0784,
+      "step": 420
+    },
+    {
+      "epoch": 0.5020431990659662,
+      "grad_norm": 0.22972488403320312,
+      "learning_rate": 0.0002,
+      "loss": 1.1794,
+      "step": 430
+    },
+    {
+      "epoch": 0.5137186223000584,
+      "grad_norm": 0.17576873302459717,
+      "learning_rate": 0.0002,
+      "loss": 1.0857,
+      "step": 440
+    },
+    {
+      "epoch": 0.5253940455341506,
+      "grad_norm": 0.42553630471229553,
+      "learning_rate": 0.0002,
+      "loss": 1.2453,
+      "step": 450
+    },
+    {
+      "epoch": 0.5370694687682428,
+      "grad_norm": 0.2631092071533203,
+      "learning_rate": 0.0002,
+      "loss": 1.2011,
+      "step": 460
+    },
+    {
+      "epoch": 0.5487448920023351,
+      "grad_norm": 0.22879736125469208,
+      "learning_rate": 0.0002,
+      "loss": 1.2222,
+      "step": 470
+    },
+    {
+      "epoch": 0.5604203152364273,
+      "grad_norm": 0.1826648712158203,
+      "learning_rate": 0.0002,
+      "loss": 1.3207,
+      "step": 480
+    },
+    {
+      "epoch": 0.5720957384705195,
+      "grad_norm": 0.18885228037834167,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 490
+    },
+    {
+      "epoch": 0.5837711617046117,
+      "grad_norm": 0.17247331142425537,
+      "learning_rate": 0.0002,
+      "loss": 1.1076,
+      "step": 500
+    },
+    {
+      "epoch": 0.5954465849387041,
+      "grad_norm": 0.19905146956443787,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 510
+    },
+    {
+      "epoch": 0.6071220081727963,
+      "grad_norm": 0.21799565851688385,
+      "learning_rate": 0.0002,
+      "loss": 0.9839,
+      "step": 520
+    },
+    {
+      "epoch": 0.6187974314068885,
+      "grad_norm": 0.2032463699579239,
+      "learning_rate": 0.0002,
+      "loss": 1.0234,
+      "step": 530
+    },
+    {
+      "epoch": 0.6304728546409807,
+      "grad_norm": 0.14968429505825043,
+      "learning_rate": 0.0002,
+      "loss": 0.9503,
+      "step": 540
+    },
+    {
+      "epoch": 0.642148277875073,
+      "grad_norm": 0.17513799667358398,
+      "learning_rate": 0.0002,
+      "loss": 0.833,
+      "step": 550
+    },
+    {
+      "epoch": 0.6538237011091652,
+      "grad_norm": 0.1893497258424759,
+      "learning_rate": 0.0002,
+      "loss": 1.0586,
+      "step": 560
+    },
+    {
+      "epoch": 0.6654991243432574,
+      "grad_norm": 0.3045499324798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1426,
+      "step": 570
+    },
+    {
+      "epoch": 0.6771745475773496,
+      "grad_norm": 0.21172650158405304,
+      "learning_rate": 0.0002,
+      "loss": 0.9317,
+      "step": 580
+    },
+    {
+      "epoch": 0.688849970811442,
+      "grad_norm": 0.20392045378684998,
+      "learning_rate": 0.0002,
+      "loss": 1.1187,
+      "step": 590
+    },
+    {
+      "epoch": 0.7005253940455342,
+      "grad_norm": 0.17182187736034393,
+      "learning_rate": 0.0002,
+      "loss": 0.9187,
+      "step": 600
+    },
+    {
+      "epoch": 0.7122008172796264,
+      "grad_norm": 0.17221297323703766,
+      "learning_rate": 0.0002,
+      "loss": 0.9988,
+      "step": 610
+    },
+    {
+      "epoch": 0.7238762405137186,
+      "grad_norm": 0.18639299273490906,
+      "learning_rate": 0.0002,
+      "loss": 1.0334,
+      "step": 620
+    },
+    {
+      "epoch": 0.7355516637478109,
+      "grad_norm": 0.16991640627384186,
+      "learning_rate": 0.0002,
+      "loss": 1.0834,
+      "step": 630
+    },
+    {
+      "epoch": 0.7472270869819031,
+      "grad_norm": 0.23263484239578247,
+      "learning_rate": 0.0002,
+      "loss": 0.9335,
+      "step": 640
+    },
+    {
+      "epoch": 0.7589025102159953,
+      "grad_norm": 0.16419798135757446,
+      "learning_rate": 0.0002,
+      "loss": 0.9715,
+      "step": 650
+    },
+    {
+      "epoch": 0.7705779334500875,
+      "grad_norm": 0.20663365721702576,
+      "learning_rate": 0.0002,
+      "loss": 1.0119,
+      "step": 660
+    },
+    {
+      "epoch": 0.7822533566841798,
+      "grad_norm": 0.21871459484100342,
+      "learning_rate": 0.0002,
+      "loss": 1.143,
+      "step": 670
+    },
+    {
+      "epoch": 0.793928779918272,
+      "grad_norm": 0.20669031143188477,
+      "learning_rate": 0.0002,
+      "loss": 1.0363,
+      "step": 680
+    },
+    {
+      "epoch": 0.8056042031523643,
+      "grad_norm": 0.1783137321472168,
+      "learning_rate": 0.0002,
+      "loss": 1.0825,
+      "step": 690
+    },
+    {
+      "epoch": 0.8172796263864565,
+      "grad_norm": 0.24621079862117767,
+      "learning_rate": 0.0002,
+      "loss": 1.0002,
+      "step": 700
+    },
+    {
+      "epoch": 0.8289550496205488,
+      "grad_norm": 0.22598953545093536,
+      "learning_rate": 0.0002,
+      "loss": 1.1322,
+      "step": 710
+    },
+    {
+      "epoch": 0.840630472854641,
+      "grad_norm": 0.17925500869750977,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 720
+    },
+    {
+      "epoch": 0.8523058960887332,
+      "grad_norm": 0.25278252363204956,
+      "learning_rate": 0.0002,
+      "loss": 1.0691,
+      "step": 730
+    },
+    {
+      "epoch": 0.8639813193228254,
+      "grad_norm": 0.5249322652816772,
+      "learning_rate": 0.0002,
+      "loss": 1.0791,
+      "step": 740
+    },
+    {
+      "epoch": 0.8756567425569177,
+      "grad_norm": 0.29942265152931213,
+      "learning_rate": 0.0002,
+      "loss": 1.0798,
+      "step": 750
+    },
+    {
+      "epoch": 0.8873321657910099,
+      "grad_norm": 0.2682401239871979,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 760
+    },
+    {
+      "epoch": 0.8990075890251021,
+      "grad_norm": 0.28810951113700867,
+      "learning_rate": 0.0002,
+      "loss": 1.0917,
+      "step": 770
+    },
+    {
+      "epoch": 0.9106830122591943,
+      "grad_norm": 0.24986644089221954,
+      "learning_rate": 0.0002,
+      "loss": 1.0009,
+      "step": 780
+    },
+    {
+      "epoch": 0.9223584354932867,
+      "grad_norm": 0.21351364254951477,
+      "learning_rate": 0.0002,
+      "loss": 1.0751,
+      "step": 790
+    },
+    {
+      "epoch": 0.9340338587273789,
+      "grad_norm": 0.21321788430213928,
+      "learning_rate": 0.0002,
+      "loss": 1.2201,
+      "step": 800
+    },
+    {
+      "epoch": 0.9457092819614711,
+      "grad_norm": 0.39119839668273926,
+      "learning_rate": 0.0002,
+      "loss": 1.0977,
+      "step": 810
+    },
+    {
+      "epoch": 0.9573847051955633,
+      "grad_norm": 0.1995590776205063,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 820
+    },
+    {
+      "epoch": 0.9690601284296556,
+      "grad_norm": 0.1983078271150589,
+      "learning_rate": 0.0002,
+      "loss": 0.9257,
+      "step": 830
+    },
+    {
+      "epoch": 0.9807355516637478,
+      "grad_norm": 0.19562935829162598,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 840
+    },
+    {
+      "epoch": 0.99241097489784,
+      "grad_norm": 0.21720626950263977,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 850
+    },
+    {
+      "epoch": 0.9994162288382954,
+      "eval_loss": 1.1984628438949585,
+      "eval_runtime": 52.824,
+      "eval_samples_per_second": 8.67,
+      "eval_steps_per_second": 1.098,
+      "step": 856
+    },
+    {
+      "epoch": 1.0040863981319323,
+      "grad_norm": 0.20022626221179962,
+      "learning_rate": 0.0002,
+      "loss": 1.1012,
+      "step": 860
+    },
+    {
+      "epoch": 1.0157618213660244,
+      "grad_norm": 0.18347179889678955,
+      "learning_rate": 0.0002,
+      "loss": 1.1477,
+      "step": 870
+    },
+    {
+      "epoch": 1.0274372446001168,
+      "grad_norm": 0.27677398920059204,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 880
+    },
+    {
+      "epoch": 1.039112667834209,
+      "grad_norm": 0.1613788902759552,
+      "learning_rate": 0.0002,
+      "loss": 0.9135,
+      "step": 890
+    },
+    {
+      "epoch": 1.0507880910683012,
+      "grad_norm": 0.34981176257133484,
+      "learning_rate": 0.0002,
+      "loss": 0.9362,
+      "step": 900
+    },
+    {
+      "epoch": 1.0624635143023935,
+      "grad_norm": 0.2047315239906311,
+      "learning_rate": 0.0002,
+      "loss": 1.0158,
+      "step": 910
+    },
+    {
+      "epoch": 1.0741389375364856,
+      "grad_norm": 0.2312125563621521,
+      "learning_rate": 0.0002,
+      "loss": 1.0819,
+      "step": 920
+    },
+    {
+      "epoch": 1.085814360770578,
+      "grad_norm": 0.1890091598033905,
+      "learning_rate": 0.0002,
+      "loss": 0.8474,
+      "step": 930
+    },
+    {
+      "epoch": 1.0974897840046702,
+      "grad_norm": 0.2594001889228821,
+      "learning_rate": 0.0002,
+      "loss": 0.9807,
+      "step": 940
+    },
+    {
+      "epoch": 1.1091652072387623,
+      "grad_norm": 0.23180805146694183,
+      "learning_rate": 0.0002,
+      "loss": 0.9598,
+      "step": 950
+    },
+    {
+      "epoch": 1.1208406304728546,
+      "grad_norm": 0.3079565465450287,
+      "learning_rate": 0.0002,
+      "loss": 0.9935,
+      "step": 960
+    },
+    {
+      "epoch": 1.132516053706947,
+      "grad_norm": 0.348038911819458,
+      "learning_rate": 0.0002,
+      "loss": 1.1019,
+      "step": 970
+    },
+    {
+      "epoch": 1.144191476941039,
+      "grad_norm": 0.25485727190971375,
+      "learning_rate": 0.0002,
+      "loss": 1.0712,
+      "step": 980
+    },
+    {
+      "epoch": 1.1558669001751314,
+      "grad_norm": 0.3280978202819824,
+      "learning_rate": 0.0002,
+      "loss": 0.9455,
+      "step": 990
+    },
+    {
+      "epoch": 1.1675423234092235,
+      "grad_norm": 0.3325645327568054,
+      "learning_rate": 0.0002,
+      "loss": 0.8836,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1792177466433158,
+      "grad_norm": 0.25743699073791504,
+      "learning_rate": 0.0002,
+      "loss": 0.8908,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1908931698774081,
+      "grad_norm": 0.23885756731033325,
+      "learning_rate": 0.0002,
+      "loss": 0.9363,
+      "step": 1020
+    },
+    {
+      "epoch": 1.2025685931115002,
+      "grad_norm": 0.2594054043292999,
+      "learning_rate": 0.0002,
+      "loss": 1.0811,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2142440163455925,
+      "grad_norm": 0.2806910276412964,
+      "learning_rate": 0.0002,
+      "loss": 0.8865,
+      "step": 1040
+    },
+    {
+      "epoch": 1.2259194395796849,
+      "grad_norm": 0.2919756770133972,
+      "learning_rate": 0.0002,
+      "loss": 0.9874,
+      "step": 1050
+    },
+    {
+      "epoch": 1.237594862813777,
+      "grad_norm": 0.2846801280975342,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 1060
+    },
+    {
+      "epoch": 1.2492702860478693,
+      "grad_norm": 0.22056721150875092,
+      "learning_rate": 0.0002,
+      "loss": 1.0518,
+      "step": 1070
+    },
+    {
+      "epoch": 1.2609457092819616,
+      "grad_norm": 0.21786770224571228,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 1080
+    },
+    {
+      "epoch": 1.2726211325160537,
+      "grad_norm": 0.21728235483169556,
+      "learning_rate": 0.0002,
+      "loss": 1.1127,
+      "step": 1090
+    },
+    {
+      "epoch": 1.284296555750146,
+      "grad_norm": 0.38934388756752014,
+      "learning_rate": 0.0002,
+      "loss": 0.9172,
+      "step": 1100
+    },
+    {
+      "epoch": 1.295971978984238,
+      "grad_norm": 0.4942418336868286,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3076474022183304,
+      "grad_norm": 0.22632132470607758,
+      "learning_rate": 0.0002,
+      "loss": 0.979,
+      "step": 1120
+    },
+    {
+      "epoch": 1.3193228254524225,
+      "grad_norm": 0.2033795416355133,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1130
+    },
+    {
+      "epoch": 1.3309982486865148,
+      "grad_norm": 0.3090406656265259,
+      "learning_rate": 0.0002,
+      "loss": 1.022,
+      "step": 1140
+    },
+    {
+      "epoch": 1.3426736719206072,
+      "grad_norm": 0.3908712565898895,
+      "learning_rate": 0.0002,
+      "loss": 0.9919,
+      "step": 1150
+    },
+    {
+      "epoch": 1.3543490951546993,
+      "grad_norm": 0.5885194540023804,
+      "learning_rate": 0.0002,
+      "loss": 0.9933,
+      "step": 1160
+    },
+    {
+      "epoch": 1.3660245183887916,
+      "grad_norm": 0.28344446420669556,
+      "learning_rate": 0.0002,
+      "loss": 0.9689,
+      "step": 1170
+    },
+    {
+      "epoch": 1.377699941622884,
+      "grad_norm": 0.305290162563324,
+      "learning_rate": 0.0002,
+      "loss": 0.8012,
+      "step": 1180
+    },
+    {
+      "epoch": 1.389375364856976,
+      "grad_norm": 0.47231870889663696,
+      "learning_rate": 0.0002,
+      "loss": 1.0218,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4010507880910683,
+      "grad_norm": 0.1865382194519043,
+      "learning_rate": 0.0002,
+      "loss": 0.7837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4127262113251606,
+      "grad_norm": 0.19491282105445862,
+      "learning_rate": 0.0002,
+      "loss": 0.8884,
+      "step": 1210
+    },
+    {
+      "epoch": 1.4244016345592527,
+      "grad_norm": 0.26192259788513184,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1220
+    },
+    {
+      "epoch": 1.436077057793345,
+      "grad_norm": 0.25829964876174927,
+      "learning_rate": 0.0002,
+      "loss": 0.959,
+      "step": 1230
+    },
+    {
+      "epoch": 1.4477524810274374,
+      "grad_norm": 0.24313300848007202,
+      "learning_rate": 0.0002,
+      "loss": 0.9476,
+      "step": 1240
+    },
+    {
+      "epoch": 1.4594279042615295,
+      "grad_norm": 0.18807671964168549,
+      "learning_rate": 0.0002,
+      "loss": 0.9669,
+      "step": 1250
+    },
+    {
+      "epoch": 1.4711033274956218,
+      "grad_norm": 0.24352246522903442,
+      "learning_rate": 0.0002,
+      "loss": 0.9117,
+      "step": 1260
+    },
+    {
+      "epoch": 1.4827787507297139,
+      "grad_norm": 0.401624470949173,
+      "learning_rate": 0.0002,
+      "loss": 0.9541,
+      "step": 1270
+    },
+    {
+      "epoch": 1.4944541739638062,
+      "grad_norm": 0.3230941891670227,
+      "learning_rate": 0.0002,
+      "loss": 0.8756,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5061295971978983,
+      "grad_norm": 0.22052809596061707,
+      "learning_rate": 0.0002,
+      "loss": 0.8811,
+      "step": 1290
+    },
+    {
+      "epoch": 1.5178050204319906,
+      "grad_norm": 0.8212894201278687,
+      "learning_rate": 0.0002,
+      "loss": 1.0614,
+      "step": 1300
+    },
+    {
+      "epoch": 1.529480443666083,
+      "grad_norm": 0.2073482722043991,
+      "learning_rate": 0.0002,
+      "loss": 0.9205,
+      "step": 1310
+    },
+    {
+      "epoch": 1.541155866900175,
+      "grad_norm": 0.2663249373435974,
+      "learning_rate": 0.0002,
+      "loss": 0.9092,
+      "step": 1320
+    },
+    {
+      "epoch": 1.5528312901342674,
+      "grad_norm": 0.20269067585468292,
+      "learning_rate": 0.0002,
+      "loss": 0.9093,
+      "step": 1330
+    },
+    {
+      "epoch": 1.5645067133683597,
+      "grad_norm": 0.23635980486869812,
+      "learning_rate": 0.0002,
+      "loss": 0.9924,
+      "step": 1340
+    },
+    {
+      "epoch": 1.5761821366024518,
+      "grad_norm": 0.2865060865879059,
+      "learning_rate": 0.0002,
+      "loss": 0.9669,
+      "step": 1350
+    },
+    {
+      "epoch": 1.587857559836544,
+      "grad_norm": 0.19927282631397247,
+      "learning_rate": 0.0002,
+      "loss": 0.982,
+      "step": 1360
+    },
+    {
+      "epoch": 1.5995329830706364,
+      "grad_norm": 0.2837635278701782,
+      "learning_rate": 0.0002,
+      "loss": 0.9702,
+      "step": 1370
+    },
+    {
+      "epoch": 1.6112084063047285,
+      "grad_norm": 0.20541565120220184,
+      "learning_rate": 0.0002,
+      "loss": 0.8996,
+      "step": 1380
+    },
+    {
+      "epoch": 1.6228838295388208,
+      "grad_norm": 0.29976120591163635,
+      "learning_rate": 0.0002,
+      "loss": 0.9704,
+      "step": 1390
+    },
+    {
+      "epoch": 1.6345592527729131,
+      "grad_norm": 0.4811157286167145,
+      "learning_rate": 0.0002,
+      "loss": 1.0675,
+      "step": 1400
+    },
+    {
+      "epoch": 1.6462346760070052,
+      "grad_norm": 0.45696231722831726,
+      "learning_rate": 0.0002,
+      "loss": 0.9802,
+      "step": 1410
+    },
+    {
+      "epoch": 1.6579100992410973,
+      "grad_norm": 0.26527705788612366,
+      "learning_rate": 0.0002,
+      "loss": 0.993,
+      "step": 1420
+    },
+    {
+      "epoch": 1.6695855224751899,
+      "grad_norm": 0.2607211172580719,
+      "learning_rate": 0.0002,
+      "loss": 0.8715,
+      "step": 1430
+    },
+    {
+      "epoch": 1.681260945709282,
+      "grad_norm": 0.30872678756713867,
+      "learning_rate": 0.0002,
+      "loss": 0.9527,
+      "step": 1440
+    },
+    {
+      "epoch": 1.692936368943374,
+      "grad_norm": 0.378160297870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1103,
+      "step": 1450
+    },
+    {
+      "epoch": 1.7046117921774664,
+      "grad_norm": 0.4253346025943756,
+      "learning_rate": 0.0002,
+      "loss": 0.9507,
+      "step": 1460
+    },
+    {
+      "epoch": 1.7162872154115587,
+      "grad_norm": 0.23859360814094543,
+      "learning_rate": 0.0002,
+      "loss": 1.0566,
+      "step": 1470
+    },
+    {
+      "epoch": 1.7279626386456508,
+      "grad_norm": 0.4765002727508545,
+      "learning_rate": 0.0002,
+      "loss": 0.93,
+      "step": 1480
+    },
+    {
+      "epoch": 1.7396380618797431,
+      "grad_norm": 0.1958352029323578,
+      "learning_rate": 0.0002,
+      "loss": 0.9884,
+      "step": 1490
+    },
+    {
+      "epoch": 1.7513134851138354,
+      "grad_norm": 0.1772938221693039,
+      "learning_rate": 0.0002,
+      "loss": 1.0072,
+      "step": 1500
+    },
+    {
+      "epoch": 1.7629889083479275,
+      "grad_norm": 0.2589171826839447,
+      "learning_rate": 0.0002,
+      "loss": 0.9798,
+      "step": 1510
+    },
+    {
+      "epoch": 1.7746643315820199,
+      "grad_norm": 0.638349175453186,
+      "learning_rate": 0.0002,
+      "loss": 0.9079,
+      "step": 1520
+    },
+    {
+      "epoch": 1.7863397548161122,
+      "grad_norm": 0.2402309626340866,
+      "learning_rate": 0.0002,
+      "loss": 0.8141,
+      "step": 1530
+    },
+    {
+      "epoch": 1.7980151780502043,
+      "grad_norm": 0.3758494257926941,
+      "learning_rate": 0.0002,
+      "loss": 0.9535,
+      "step": 1540
+    },
+    {
+      "epoch": 1.8096906012842966,
+      "grad_norm": 0.26750659942626953,
+      "learning_rate": 0.0002,
+      "loss": 0.9862,
+      "step": 1550
+    },
+    {
+      "epoch": 1.821366024518389,
+      "grad_norm": 0.3884737193584442,
+      "learning_rate": 0.0002,
+      "loss": 0.9289,
+      "step": 1560
+    },
+    {
+      "epoch": 1.833041447752481,
+      "grad_norm": 0.2704276740550995,
+      "learning_rate": 0.0002,
+      "loss": 0.9064,
+      "step": 1570
+    },
+    {
+      "epoch": 1.844716870986573,
+      "grad_norm": 0.2269623726606369,
+      "learning_rate": 0.0002,
+      "loss": 1.0027,
+      "step": 1580
+    },
+    {
+      "epoch": 1.8563922942206657,
+      "grad_norm": 0.23369084298610687,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1590
+    },
+    {
+      "epoch": 1.8680677174547577,
+      "grad_norm": 0.34336966276168823,
+      "learning_rate": 0.0002,
+      "loss": 0.8899,
+      "step": 1600
+    },
+    {
+      "epoch": 1.8797431406888498,
+      "grad_norm": 0.638863205909729,
+      "learning_rate": 0.0002,
+      "loss": 0.9465,
+      "step": 1610
+    },
+    {
+      "epoch": 1.8914185639229422,
+      "grad_norm": 0.4810437262058258,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 1620
+    },
+    {
+      "epoch": 1.9030939871570345,
+      "grad_norm": 0.27600526809692383,
+      "learning_rate": 0.0002,
+      "loss": 0.8272,
+      "step": 1630
+    },
+    {
+      "epoch": 1.9147694103911266,
+      "grad_norm": 0.44480231404304504,
+      "learning_rate": 0.0002,
+      "loss": 0.8816,
+      "step": 1640
+    },
+    {
+      "epoch": 1.926444833625219,
+      "grad_norm": 0.29854336380958557,
+      "learning_rate": 0.0002,
+      "loss": 0.8868,
+      "step": 1650
+    },
+    {
+      "epoch": 1.9381202568593112,
+      "grad_norm": 0.21352696418762207,
+      "learning_rate": 0.0002,
+      "loss": 0.9721,
+      "step": 1660
+    },
+    {
+      "epoch": 1.9497956800934033,
+      "grad_norm": 0.26450464129447937,
+      "learning_rate": 0.0002,
+      "loss": 0.9225,
+      "step": 1670
+    },
+    {
+      "epoch": 1.9614711033274956,
+      "grad_norm": 0.23895719647407532,
+      "learning_rate": 0.0002,
+      "loss": 1.0356,
+      "step": 1680
+    },
+    {
+      "epoch": 1.973146526561588,
+      "grad_norm": 0.23323677480220795,
+      "learning_rate": 0.0002,
+      "loss": 0.9148,
+      "step": 1690
+    },
+    {
+      "epoch": 1.98482194979568,
+      "grad_norm": 0.46997103095054626,
+      "learning_rate": 0.0002,
+      "loss": 0.9857,
+      "step": 1700
+    },
+    {
+      "epoch": 1.9964973730297724,
+      "grad_norm": 0.34337419271469116,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 1710
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1868568658828735,
+      "eval_runtime": 53.25,
+      "eval_samples_per_second": 8.601,
+      "eval_steps_per_second": 1.089,
+      "step": 1713
+    },
+    {
+      "epoch": 2.0081727962638647,
+      "grad_norm": 0.19537708163261414,
+      "learning_rate": 0.0002,
+      "loss": 1.0032,
+      "step": 1720
+    },
+    {
+      "epoch": 2.019848219497957,
+      "grad_norm": 0.23779849708080292,
+      "learning_rate": 0.0002,
+      "loss": 0.7753,
+      "step": 1730
+    },
+    {
+      "epoch": 2.031523642732049,
+      "grad_norm": 0.5516199469566345,
+      "learning_rate": 0.0002,
+      "loss": 0.8975,
+      "step": 1740
+    },
+    {
+      "epoch": 2.0431990659661414,
+      "grad_norm": 0.36250197887420654,
+      "learning_rate": 0.0002,
+      "loss": 0.8137,
+      "step": 1750
+    },
+    {
+      "epoch": 2.0548744892002335,
+      "grad_norm": 0.4038652777671814,
+      "learning_rate": 0.0002,
+      "loss": 0.7819,
+      "step": 1760
+    },
+    {
+      "epoch": 2.0665499124343256,
+      "grad_norm": 0.36477968096733093,
+      "learning_rate": 0.0002,
+      "loss": 0.8192,
+      "step": 1770
+    },
+    {
+      "epoch": 2.078225335668418,
+      "grad_norm": 0.48163020610809326,
+      "learning_rate": 0.0002,
+      "loss": 0.9101,
+      "step": 1780
+    },
+    {
+      "epoch": 2.0899007589025103,
+      "grad_norm": 0.41786351799964905,
+      "learning_rate": 0.0002,
+      "loss": 0.8166,
+      "step": 1790
+    },
+    {
+      "epoch": 2.1015761821366024,
+      "grad_norm": 0.24622796475887299,
+      "learning_rate": 0.0002,
+      "loss": 0.8776,
+      "step": 1800
+    },
+    {
+      "epoch": 2.113251605370695,
+      "grad_norm": 0.2948087155818939,
+      "learning_rate": 0.0002,
+      "loss": 0.838,
+      "step": 1810
+    },
+    {
+      "epoch": 2.124927028604787,
+      "grad_norm": 0.29395580291748047,
+      "learning_rate": 0.0002,
+      "loss": 0.729,
+      "step": 1820
+    },
+    {
+      "epoch": 2.136602451838879,
+      "grad_norm": 0.4753067195415497,
+      "learning_rate": 0.0002,
+      "loss": 0.7967,
+      "step": 1830
+    },
+    {
+      "epoch": 2.148277875072971,
+      "grad_norm": 0.5675700902938843,
+      "learning_rate": 0.0002,
+      "loss": 0.892,
+      "step": 1840
+    },
+    {
+      "epoch": 2.1599532983070637,
+      "grad_norm": 0.6422085762023926,
+      "learning_rate": 0.0002,
+      "loss": 0.8901,
+      "step": 1850
+    },
+    {
+      "epoch": 2.171628721541156,
+      "grad_norm": 0.677617609500885,
+      "learning_rate": 0.0002,
+      "loss": 0.8384,
+      "step": 1860
+    },
+    {
+      "epoch": 2.183304144775248,
+      "grad_norm": 0.501675009727478,
+      "learning_rate": 0.0002,
+      "loss": 0.9367,
+      "step": 1870
+    },
+    {
+      "epoch": 2.1949795680093405,
+      "grad_norm": 0.2996771037578583,
+      "learning_rate": 0.0002,
+      "loss": 0.8267,
+      "step": 1880
+    },
+    {
+      "epoch": 2.2066549912434326,
+      "grad_norm": 0.285370796918869,
+      "learning_rate": 0.0002,
+      "loss": 0.8226,
+      "step": 1890
+    },
+    {
+      "epoch": 2.2183304144775247,
+      "grad_norm": 0.25751280784606934,
+      "learning_rate": 0.0002,
+      "loss": 0.8776,
+      "step": 1900
+    },
+    {
+      "epoch": 2.230005837711617,
+      "grad_norm": 0.5294895768165588,
+      "learning_rate": 0.0002,
+      "loss": 0.7739,
+      "step": 1910
+    },
+    {
+      "epoch": 2.2416812609457093,
+      "grad_norm": 0.5125291347503662,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 1920
+    },
+    {
+      "epoch": 2.2533566841798014,
+      "grad_norm": 0.40055087208747864,
+      "learning_rate": 0.0002,
+      "loss": 0.7923,
+      "step": 1930
+    },
+    {
+      "epoch": 2.265032107413894,
+      "grad_norm": 0.32131722569465637,
+      "learning_rate": 0.0002,
+      "loss": 0.7963,
+      "step": 1940
+    },
+    {
+      "epoch": 2.276707530647986,
+      "grad_norm": 0.40105271339416504,
+      "learning_rate": 0.0002,
+      "loss": 0.918,
+      "step": 1950
+    },
+    {
+      "epoch": 2.288382953882078,
+      "grad_norm": 0.274095356464386,
+      "learning_rate": 0.0002,
+      "loss": 0.7701,
+      "step": 1960
+    },
+    {
+      "epoch": 2.3000583771161702,
+      "grad_norm": 0.6427090764045715,
+      "learning_rate": 0.0002,
+      "loss": 0.8222,
+      "step": 1970
+    },
+    {
+      "epoch": 2.3117338003502628,
+      "grad_norm": 0.32184919714927673,
+      "learning_rate": 0.0002,
+      "loss": 0.8151,
+      "step": 1980
+    },
+    {
+      "epoch": 2.323409223584355,
+      "grad_norm": 0.28641724586486816,
+      "learning_rate": 0.0002,
+      "loss": 0.791,
+      "step": 1990
+    },
+    {
+      "epoch": 2.335084646818447,
+      "grad_norm": 0.8957763314247131,
+      "learning_rate": 0.0002,
+      "loss": 0.8111,
+      "step": 2000
+    },
+    {
+      "epoch": 2.3467600700525395,
+      "grad_norm": 0.43205350637435913,
+      "learning_rate": 0.0002,
+      "loss": 0.7833,
+      "step": 2010
+    },
+    {
+      "epoch": 2.3584354932866316,
+      "grad_norm": 0.2754843831062317,
+      "learning_rate": 0.0002,
+      "loss": 0.8703,
+      "step": 2020
+    },
+    {
+      "epoch": 2.3701109165207237,
+      "grad_norm": 0.3866446316242218,
+      "learning_rate": 0.0002,
+      "loss": 0.825,
+      "step": 2030
+    },
+    {
+      "epoch": 2.3817863397548162,
+      "grad_norm": 0.596156656742096,
+      "learning_rate": 0.0002,
+      "loss": 0.9382,
+      "step": 2040
+    },
+    {
+      "epoch": 2.3934617629889083,
+      "grad_norm": 0.5955569744110107,
+      "learning_rate": 0.0002,
+      "loss": 0.8716,
+      "step": 2050
+    },
+    {
+      "epoch": 2.4051371862230004,
+      "grad_norm": 0.49891725182533264,
+      "learning_rate": 0.0002,
+      "loss": 0.77,
+      "step": 2060
+    },
+    {
+      "epoch": 2.416812609457093,
+      "grad_norm": 0.336823433637619,
+      "learning_rate": 0.0002,
+      "loss": 0.8657,
+      "step": 2070
+    },
+    {
+      "epoch": 2.428488032691185,
+      "grad_norm": 0.31427133083343506,
+      "learning_rate": 0.0002,
+      "loss": 0.8321,
+      "step": 2080
+    },
+    {
+      "epoch": 2.440163455925277,
+      "grad_norm": 0.6004841923713684,
+      "learning_rate": 0.0002,
+      "loss": 0.8567,
+      "step": 2090
+    },
+    {
+      "epoch": 2.4518388791593697,
+      "grad_norm": 0.6182882189750671,
+      "learning_rate": 0.0002,
+      "loss": 0.8419,
+      "step": 2100
+    },
+    {
+      "epoch": 2.463514302393462,
+      "grad_norm": 0.4464357793331146,
+      "learning_rate": 0.0002,
+      "loss": 0.7655,
+      "step": 2110
+    },
+    {
+      "epoch": 2.475189725627554,
+      "grad_norm": 0.26698681712150574,
+      "learning_rate": 0.0002,
+      "loss": 0.7783,
+      "step": 2120
+    },
+    {
+      "epoch": 2.4868651488616464,
+      "grad_norm": 0.32835668325424194,
+      "learning_rate": 0.0002,
+      "loss": 0.8636,
+      "step": 2130
+    },
+    {
+      "epoch": 2.4985405720957385,
+      "grad_norm": 0.30060240626335144,
+      "learning_rate": 0.0002,
+      "loss": 0.7974,
+      "step": 2140
+    },
+    {
+      "epoch": 2.5102159953298306,
+      "grad_norm": 0.5971834659576416,
+      "learning_rate": 0.0002,
+      "loss": 0.8365,
+      "step": 2150
+    },
+    {
+      "epoch": 2.521891418563923,
+      "grad_norm": 0.29032406210899353,
+      "learning_rate": 0.0002,
+      "loss": 0.8427,
+      "step": 2160
+    },
+    {
+      "epoch": 2.5335668417980153,
+      "grad_norm": 0.3044188916683197,
+      "learning_rate": 0.0002,
+      "loss": 0.8321,
+      "step": 2170
+    },
+    {
+      "epoch": 2.5452422650321074,
+      "grad_norm": 0.5061913728713989,
+      "learning_rate": 0.0002,
+      "loss": 0.9266,
+      "step": 2180
+    },
+    {
+      "epoch": 2.5569176882662,
+      "grad_norm": 0.3165229856967926,
+      "learning_rate": 0.0002,
+      "loss": 0.9161,
+      "step": 2190
+    },
+    {
+      "epoch": 2.568593111500292,
+      "grad_norm": 0.5463014841079712,
+      "learning_rate": 0.0002,
+      "loss": 0.9278,
+      "step": 2200
+    },
+    {
+      "epoch": 2.580268534734384,
+      "grad_norm": 0.28532662987709045,
+      "learning_rate": 0.0002,
+      "loss": 0.7847,
+      "step": 2210
+    },
+    {
+      "epoch": 2.591943957968476,
+      "grad_norm": 0.2705112397670746,
+      "learning_rate": 0.0002,
+      "loss": 0.8256,
+      "step": 2220
+    },
+    {
+      "epoch": 2.6036193812025687,
+      "grad_norm": 0.7311036586761475,
+      "learning_rate": 0.0002,
+      "loss": 0.8753,
+      "step": 2230
+    },
+    {
+      "epoch": 2.615294804436661,
+      "grad_norm": 0.31091684103012085,
+      "learning_rate": 0.0002,
+      "loss": 0.8641,
+      "step": 2240
+    },
+    {
+      "epoch": 2.626970227670753,
+      "grad_norm": 0.3427600860595703,
+      "learning_rate": 0.0002,
+      "loss": 0.7109,
+      "step": 2250
+    },
+    {
+      "epoch": 2.638645650904845,
+      "grad_norm": 0.426582932472229,
+      "learning_rate": 0.0002,
+      "loss": 0.8552,
+      "step": 2260
+    },
+    {
+      "epoch": 2.6503210741389376,
+      "grad_norm": 0.6608081459999084,
+      "learning_rate": 0.0002,
+      "loss": 0.8991,
+      "step": 2270
+    },
+    {
+      "epoch": 2.6619964973730297,
+      "grad_norm": 0.8316800594329834,
+      "learning_rate": 0.0002,
+      "loss": 0.8202,
+      "step": 2280
+    },
+    {
+      "epoch": 2.6736719206071218,
+      "grad_norm": 0.3304220139980316,
+      "learning_rate": 0.0002,
+      "loss": 0.7936,
+      "step": 2290
+    },
+    {
+      "epoch": 2.6853473438412143,
+      "grad_norm": 0.3448123335838318,
+      "learning_rate": 0.0002,
+      "loss": 0.9264,
+      "step": 2300
+    },
+    {
+      "epoch": 2.6970227670753064,
+      "grad_norm": 0.35891813039779663,
+      "learning_rate": 0.0002,
+      "loss": 0.7975,
+      "step": 2310
+    },
+    {
+      "epoch": 2.7086981903093985,
+      "grad_norm": 0.4558456540107727,
+      "learning_rate": 0.0002,
+      "loss": 0.8882,
+      "step": 2320
+    },
+    {
+      "epoch": 2.720373613543491,
+      "grad_norm": 0.2969972491264343,
+      "learning_rate": 0.0002,
+      "loss": 0.8277,
+      "step": 2330
+    },
+    {
+      "epoch": 2.732049036777583,
+      "grad_norm": 0.5421506762504578,
+      "learning_rate": 0.0002,
+      "loss": 0.8228,
+      "step": 2340
+    },
+    {
+      "epoch": 2.7437244600116752,
+      "grad_norm": 0.6532469987869263,
+      "learning_rate": 0.0002,
+      "loss": 0.8908,
+      "step": 2350
+    },
+    {
+      "epoch": 2.755399883245768,
+      "grad_norm": 0.30502063035964966,
+      "learning_rate": 0.0002,
+      "loss": 0.8642,
+      "step": 2360
+    },
+    {
+      "epoch": 2.76707530647986,
+      "grad_norm": 0.28669285774230957,
+      "learning_rate": 0.0002,
+      "loss": 0.8399,
+      "step": 2370
+    },
+    {
+      "epoch": 2.778750729713952,
+      "grad_norm": 0.38026052713394165,
+      "learning_rate": 0.0002,
+      "loss": 0.7736,
+      "step": 2380
+    },
+    {
+      "epoch": 2.7904261529480445,
+      "grad_norm": 0.5903686285018921,
+      "learning_rate": 0.0002,
+      "loss": 0.8979,
+      "step": 2390
+    },
+    {
+      "epoch": 2.8021015761821366,
+      "grad_norm": 0.49472540616989136,
+      "learning_rate": 0.0002,
+      "loss": 0.8518,
+      "step": 2400
+    },
+    {
+      "epoch": 2.8137769994162287,
+      "grad_norm": 0.4611932933330536,
+      "learning_rate": 0.0002,
+      "loss": 0.7729,
+      "step": 2410
+    },
+    {
+      "epoch": 2.8254524226503213,
+      "grad_norm": 0.4907233715057373,
+      "learning_rate": 0.0002,
+      "loss": 0.7701,
+      "step": 2420
+    },
+    {
+      "epoch": 2.8371278458844134,
+      "grad_norm": 0.2857356667518616,
+      "learning_rate": 0.0002,
+      "loss": 0.7011,
+      "step": 2430
+    },
+    {
+      "epoch": 2.8488032691185055,
+      "grad_norm": 0.2729904353618622,
+      "learning_rate": 0.0002,
+      "loss": 0.8805,
+      "step": 2440
+    },
+    {
+      "epoch": 2.860478692352598,
+      "grad_norm": 0.4903719425201416,
+      "learning_rate": 0.0002,
+      "loss": 0.875,
+      "step": 2450
+    },
+    {
+      "epoch": 2.87215411558669,
+      "grad_norm": 0.3039948344230652,
+      "learning_rate": 0.0002,
+      "loss": 0.8397,
+      "step": 2460
+    },
+    {
+      "epoch": 2.883829538820782,
+      "grad_norm": 0.5554929971694946,
+      "learning_rate": 0.0002,
+      "loss": 0.8152,
+      "step": 2470
+    },
+    {
+      "epoch": 2.8955049620548747,
+      "grad_norm": 0.5474334359169006,
+      "learning_rate": 0.0002,
+      "loss": 0.9162,
+      "step": 2480
+    },
+    {
+      "epoch": 2.907180385288967,
+      "grad_norm": 0.44103026390075684,
+      "learning_rate": 0.0002,
+      "loss": 0.9273,
+      "step": 2490
+    },
+    {
+      "epoch": 2.918855808523059,
+      "grad_norm": 0.2763408422470093,
+      "learning_rate": 0.0002,
+      "loss": 0.7229,
+      "step": 2500
+    },
+    {
+      "epoch": 2.9305312317571515,
+      "grad_norm": 0.7191962599754333,
+      "learning_rate": 0.0002,
+      "loss": 0.8002,
+      "step": 2510
+    },
+    {
+      "epoch": 2.9422066549912436,
+      "grad_norm": 0.5265306234359741,
+      "learning_rate": 0.0002,
+      "loss": 0.849,
+      "step": 2520
+    },
+    {
+      "epoch": 2.9538820782253357,
+      "grad_norm": 0.4153187870979309,
+      "learning_rate": 0.0002,
+      "loss": 0.8953,
+      "step": 2530
+    },
+    {
+      "epoch": 2.9655575014594278,
+      "grad_norm": 0.5972274541854858,
+      "learning_rate": 0.0002,
+      "loss": 0.9053,
+      "step": 2540
+    },
+    {
+      "epoch": 2.9772329246935203,
+      "grad_norm": 0.47656795382499695,
+      "learning_rate": 0.0002,
+      "loss": 0.8102,
+      "step": 2550
+    },
+    {
+      "epoch": 2.9889083479276124,
+      "grad_norm": 0.5227161049842834,
+      "learning_rate": 0.0002,
+      "loss": 0.9021,
+      "step": 2560
+    },
+    {
+      "epoch": 2.9994162288382955,
+      "eval_loss": 1.19550621509552,
+      "eval_runtime": 40.3059,
+      "eval_samples_per_second": 11.363,
+      "eval_steps_per_second": 1.439,
+      "step": 2569
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6848,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.119425830107218e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb077d5937aba7d77271acaac7bb330e71c022289d873a61f3df9490020d208d
+size 5688

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3329a66587ca74dddcb5c88d9c3a0259d705dcd76a147811b95b1baa12a9cc3e
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f877cf6fbdd98303ff3f655d6b55f6b34ab9cd91a6db20999fa90197d9e81494
+size 41136570

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:acd63b0fb195161854ed65e2000444d0ad713c0bdbfd538dc1d5f469238569ca
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0db4d97979391aca22eb7d270ee5671b3c42c0ec3249d177a631f249d4d1780
+size 1064

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
+size 11418266

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

	@@ -0,0 +1,2459 @@

+{
+  "best_metric": 1.1868568658828735,
+  "best_model_checkpoint": "outputs-001/Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1713",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 3426,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011675423234092236,
+      "grad_norm": 0.26495933532714844,
+      "learning_rate": 0.0002,
+      "loss": 1.5356,
+      "step": 10
+    },
+    {
+      "epoch": 0.023350846468184472,
+      "grad_norm": 0.26226115226745605,
+      "learning_rate": 0.0002,
+      "loss": 1.5736,
+      "step": 20
+    },
+    {
+      "epoch": 0.03502626970227671,
+      "grad_norm": 0.2216806709766388,
+      "learning_rate": 0.0002,
+      "loss": 1.2195,
+      "step": 30
+    },
+    {
+      "epoch": 0.046701692936368944,
+      "grad_norm": 0.2804628014564514,
+      "learning_rate": 0.0002,
+      "loss": 1.3899,
+      "step": 40
+    },
+    {
+      "epoch": 0.05837711617046118,
+      "grad_norm": 0.26673951745033264,
+      "learning_rate": 0.0002,
+      "loss": 1.2255,
+      "step": 50
+    },
+    {
+      "epoch": 0.07005253940455342,
+      "grad_norm": 0.22234757244586945,
+      "learning_rate": 0.0002,
+      "loss": 1.2042,
+      "step": 60
+    },
+    {
+      "epoch": 0.08172796263864565,
+      "grad_norm": 0.17038528621196747,
+      "learning_rate": 0.0002,
+      "loss": 0.9847,
+      "step": 70
+    },
+    {
+      "epoch": 0.09340338587273789,
+      "grad_norm": 0.22402487695217133,
+      "learning_rate": 0.0002,
+      "loss": 0.9697,
+      "step": 80
+    },
+    {
+      "epoch": 0.10507880910683012,
+      "grad_norm": 0.2240290343761444,
+      "learning_rate": 0.0002,
+      "loss": 1.1175,
+      "step": 90
+    },
+    {
+      "epoch": 0.11675423234092236,
+      "grad_norm": 0.2043554037809372,
+      "learning_rate": 0.0002,
+      "loss": 1.1355,
+      "step": 100
+    },
+    {
+      "epoch": 0.1284296555750146,
+      "grad_norm": 0.20888502895832062,
+      "learning_rate": 0.0002,
+      "loss": 0.9512,
+      "step": 110
+    },
+    {
+      "epoch": 0.14010507880910683,
+      "grad_norm": 0.47382819652557373,
+      "learning_rate": 0.0002,
+      "loss": 1.128,
+      "step": 120
+    },
+    {
+      "epoch": 0.15178050204319907,
+      "grad_norm": 0.184955894947052,
+      "learning_rate": 0.0002,
+      "loss": 1.0121,
+      "step": 130
+    },
+    {
+      "epoch": 0.1634559252772913,
+      "grad_norm": 0.22605721652507782,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 140
+    },
+    {
+      "epoch": 0.17513134851138354,
+      "grad_norm": 0.2902279496192932,
+      "learning_rate": 0.0002,
+      "loss": 1.0642,
+      "step": 150
+    },
+    {
+      "epoch": 0.18680677174547577,
+      "grad_norm": 0.21148967742919922,
+      "learning_rate": 0.0002,
+      "loss": 1.0829,
+      "step": 160
+    },
+    {
+      "epoch": 0.198482194979568,
+      "grad_norm": 0.2443981170654297,
+      "learning_rate": 0.0002,
+      "loss": 1.1092,
+      "step": 170
+    },
+    {
+      "epoch": 0.21015761821366025,
+      "grad_norm": 0.25699228048324585,
+      "learning_rate": 0.0002,
+      "loss": 1.0163,
+      "step": 180
+    },
+    {
+      "epoch": 0.22183304144775248,
+      "grad_norm": 0.2449636310338974,
+      "learning_rate": 0.0002,
+      "loss": 1.141,
+      "step": 190
+    },
+    {
+      "epoch": 0.23350846468184472,
+      "grad_norm": 0.25968459248542786,
+      "learning_rate": 0.0002,
+      "loss": 1.1952,
+      "step": 200
+    },
+    {
+      "epoch": 0.24518388791593695,
+      "grad_norm": 0.17932388186454773,
+      "learning_rate": 0.0002,
+      "loss": 1.0165,
+      "step": 210
+    },
+    {
+      "epoch": 0.2568593111500292,
+      "grad_norm": 0.22084972262382507,
+      "learning_rate": 0.0002,
+      "loss": 1.1445,
+      "step": 220
+    },
+    {
+      "epoch": 0.2685347343841214,
+      "grad_norm": 0.23466071486473083,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 230
+    },
+    {
+      "epoch": 0.28021015761821366,
+      "grad_norm": 0.20127305388450623,
+      "learning_rate": 0.0002,
+      "loss": 1.0947,
+      "step": 240
+    },
+    {
+      "epoch": 0.29188558085230587,
+      "grad_norm": 0.22740179300308228,
+      "learning_rate": 0.0002,
+      "loss": 1.1821,
+      "step": 250
+    },
+    {
+      "epoch": 0.30356100408639813,
+      "grad_norm": 0.23858675360679626,
+      "learning_rate": 0.0002,
+      "loss": 0.9888,
+      "step": 260
+    },
+    {
+      "epoch": 0.31523642732049034,
+      "grad_norm": 0.18527966737747192,
+      "learning_rate": 0.0002,
+      "loss": 1.1404,
+      "step": 270
+    },
+    {
+      "epoch": 0.3269118505545826,
+      "grad_norm": 0.20215417444705963,
+      "learning_rate": 0.0002,
+      "loss": 1.1307,
+      "step": 280
+    },
+    {
+      "epoch": 0.3385872737886748,
+      "grad_norm": 0.17396175861358643,
+      "learning_rate": 0.0002,
+      "loss": 1.1752,
+      "step": 290
+    },
+    {
+      "epoch": 0.3502626970227671,
+      "grad_norm": 0.2083478718996048,
+      "learning_rate": 0.0002,
+      "loss": 1.222,
+      "step": 300
+    },
+    {
+      "epoch": 0.3619381202568593,
+      "grad_norm": 0.26084500551223755,
+      "learning_rate": 0.0002,
+      "loss": 0.9636,
+      "step": 310
+    },
+    {
+      "epoch": 0.37361354349095155,
+      "grad_norm": 0.2090655416250229,
+      "learning_rate": 0.0002,
+      "loss": 1.0461,
+      "step": 320
+    },
+    {
+      "epoch": 0.38528896672504376,
+      "grad_norm": 0.26721376180648804,
+      "learning_rate": 0.0002,
+      "loss": 1.0545,
+      "step": 330
+    },
+    {
+      "epoch": 0.396964389959136,
+      "grad_norm": 0.2001899778842926,
+      "learning_rate": 0.0002,
+      "loss": 0.812,
+      "step": 340
+    },
+    {
+      "epoch": 0.40863981319322823,
+      "grad_norm": 0.2354399561882019,
+      "learning_rate": 0.0002,
+      "loss": 1.0476,
+      "step": 350
+    },
+    {
+      "epoch": 0.4203152364273205,
+      "grad_norm": 0.22031325101852417,
+      "learning_rate": 0.0002,
+      "loss": 1.0466,
+      "step": 360
+    },
+    {
+      "epoch": 0.4319906596614127,
+      "grad_norm": 0.21608088910579681,
+      "learning_rate": 0.0002,
+      "loss": 1.1381,
+      "step": 370
+    },
+    {
+      "epoch": 0.44366608289550497,
+      "grad_norm": 0.2018078863620758,
+      "learning_rate": 0.0002,
+      "loss": 1.0378,
+      "step": 380
+    },
+    {
+      "epoch": 0.4553415061295972,
+      "grad_norm": 0.22110284864902496,
+      "learning_rate": 0.0002,
+      "loss": 0.892,
+      "step": 390
+    },
+    {
+      "epoch": 0.46701692936368944,
+      "grad_norm": 0.23103947937488556,
+      "learning_rate": 0.0002,
+      "loss": 0.966,
+      "step": 400
+    },
+    {
+      "epoch": 0.47869235259778165,
+      "grad_norm": 0.21037138998508453,
+      "learning_rate": 0.0002,
+      "loss": 1.0522,
+      "step": 410
+    },
+    {
+      "epoch": 0.4903677758318739,
+      "grad_norm": 0.18703506886959076,
+      "learning_rate": 0.0002,
+      "loss": 1.0784,
+      "step": 420
+    },
+    {
+      "epoch": 0.5020431990659662,
+      "grad_norm": 0.22972488403320312,
+      "learning_rate": 0.0002,
+      "loss": 1.1794,
+      "step": 430
+    },
+    {
+      "epoch": 0.5137186223000584,
+      "grad_norm": 0.17576873302459717,
+      "learning_rate": 0.0002,
+      "loss": 1.0857,
+      "step": 440
+    },
+    {
+      "epoch": 0.5253940455341506,
+      "grad_norm": 0.42553630471229553,
+      "learning_rate": 0.0002,
+      "loss": 1.2453,
+      "step": 450
+    },
+    {
+      "epoch": 0.5370694687682428,
+      "grad_norm": 0.2631092071533203,
+      "learning_rate": 0.0002,
+      "loss": 1.2011,
+      "step": 460
+    },
+    {
+      "epoch": 0.5487448920023351,
+      "grad_norm": 0.22879736125469208,
+      "learning_rate": 0.0002,
+      "loss": 1.2222,
+      "step": 470
+    },
+    {
+      "epoch": 0.5604203152364273,
+      "grad_norm": 0.1826648712158203,
+      "learning_rate": 0.0002,
+      "loss": 1.3207,
+      "step": 480
+    },
+    {
+      "epoch": 0.5720957384705195,
+      "grad_norm": 0.18885228037834167,
+      "learning_rate": 0.0002,
+      "loss": 0.9321,
+      "step": 490
+    },
+    {
+      "epoch": 0.5837711617046117,
+      "grad_norm": 0.17247331142425537,
+      "learning_rate": 0.0002,
+      "loss": 1.1076,
+      "step": 500
+    },
+    {
+      "epoch": 0.5954465849387041,
+      "grad_norm": 0.19905146956443787,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 510
+    },
+    {
+      "epoch": 0.6071220081727963,
+      "grad_norm": 0.21799565851688385,
+      "learning_rate": 0.0002,
+      "loss": 0.9839,
+      "step": 520
+    },
+    {
+      "epoch": 0.6187974314068885,
+      "grad_norm": 0.2032463699579239,
+      "learning_rate": 0.0002,
+      "loss": 1.0234,
+      "step": 530
+    },
+    {
+      "epoch": 0.6304728546409807,
+      "grad_norm": 0.14968429505825043,
+      "learning_rate": 0.0002,
+      "loss": 0.9503,
+      "step": 540
+    },
+    {
+      "epoch": 0.642148277875073,
+      "grad_norm": 0.17513799667358398,
+      "learning_rate": 0.0002,
+      "loss": 0.833,
+      "step": 550
+    },
+    {
+      "epoch": 0.6538237011091652,
+      "grad_norm": 0.1893497258424759,
+      "learning_rate": 0.0002,
+      "loss": 1.0586,
+      "step": 560
+    },
+    {
+      "epoch": 0.6654991243432574,
+      "grad_norm": 0.3045499324798584,
+      "learning_rate": 0.0002,
+      "loss": 1.1426,
+      "step": 570
+    },
+    {
+      "epoch": 0.6771745475773496,
+      "grad_norm": 0.21172650158405304,
+      "learning_rate": 0.0002,
+      "loss": 0.9317,
+      "step": 580
+    },
+    {
+      "epoch": 0.688849970811442,
+      "grad_norm": 0.20392045378684998,
+      "learning_rate": 0.0002,
+      "loss": 1.1187,
+      "step": 590
+    },
+    {
+      "epoch": 0.7005253940455342,
+      "grad_norm": 0.17182187736034393,
+      "learning_rate": 0.0002,
+      "loss": 0.9187,
+      "step": 600
+    },
+    {
+      "epoch": 0.7122008172796264,
+      "grad_norm": 0.17221297323703766,
+      "learning_rate": 0.0002,
+      "loss": 0.9988,
+      "step": 610
+    },
+    {
+      "epoch": 0.7238762405137186,
+      "grad_norm": 0.18639299273490906,
+      "learning_rate": 0.0002,
+      "loss": 1.0334,
+      "step": 620
+    },
+    {
+      "epoch": 0.7355516637478109,
+      "grad_norm": 0.16991640627384186,
+      "learning_rate": 0.0002,
+      "loss": 1.0834,
+      "step": 630
+    },
+    {
+      "epoch": 0.7472270869819031,
+      "grad_norm": 0.23263484239578247,
+      "learning_rate": 0.0002,
+      "loss": 0.9335,
+      "step": 640
+    },
+    {
+      "epoch": 0.7589025102159953,
+      "grad_norm": 0.16419798135757446,
+      "learning_rate": 0.0002,
+      "loss": 0.9715,
+      "step": 650
+    },
+    {
+      "epoch": 0.7705779334500875,
+      "grad_norm": 0.20663365721702576,
+      "learning_rate": 0.0002,
+      "loss": 1.0119,
+      "step": 660
+    },
+    {
+      "epoch": 0.7822533566841798,
+      "grad_norm": 0.21871459484100342,
+      "learning_rate": 0.0002,
+      "loss": 1.143,
+      "step": 670
+    },
+    {
+      "epoch": 0.793928779918272,
+      "grad_norm": 0.20669031143188477,
+      "learning_rate": 0.0002,
+      "loss": 1.0363,
+      "step": 680
+    },
+    {
+      "epoch": 0.8056042031523643,
+      "grad_norm": 0.1783137321472168,
+      "learning_rate": 0.0002,
+      "loss": 1.0825,
+      "step": 690
+    },
+    {
+      "epoch": 0.8172796263864565,
+      "grad_norm": 0.24621079862117767,
+      "learning_rate": 0.0002,
+      "loss": 1.0002,
+      "step": 700
+    },
+    {
+      "epoch": 0.8289550496205488,
+      "grad_norm": 0.22598953545093536,
+      "learning_rate": 0.0002,
+      "loss": 1.1322,
+      "step": 710
+    },
+    {
+      "epoch": 0.840630472854641,
+      "grad_norm": 0.17925500869750977,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 720
+    },
+    {
+      "epoch": 0.8523058960887332,
+      "grad_norm": 0.25278252363204956,
+      "learning_rate": 0.0002,
+      "loss": 1.0691,
+      "step": 730
+    },
+    {
+      "epoch": 0.8639813193228254,
+      "grad_norm": 0.5249322652816772,
+      "learning_rate": 0.0002,
+      "loss": 1.0791,
+      "step": 740
+    },
+    {
+      "epoch": 0.8756567425569177,
+      "grad_norm": 0.29942265152931213,
+      "learning_rate": 0.0002,
+      "loss": 1.0798,
+      "step": 750
+    },
+    {
+      "epoch": 0.8873321657910099,
+      "grad_norm": 0.2682401239871979,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 760
+    },
+    {
+      "epoch": 0.8990075890251021,
+      "grad_norm": 0.28810951113700867,
+      "learning_rate": 0.0002,
+      "loss": 1.0917,
+      "step": 770
+    },
+    {
+      "epoch": 0.9106830122591943,
+      "grad_norm": 0.24986644089221954,
+      "learning_rate": 0.0002,
+      "loss": 1.0009,
+      "step": 780
+    },
+    {
+      "epoch": 0.9223584354932867,
+      "grad_norm": 0.21351364254951477,
+      "learning_rate": 0.0002,
+      "loss": 1.0751,
+      "step": 790
+    },
+    {
+      "epoch": 0.9340338587273789,
+      "grad_norm": 0.21321788430213928,
+      "learning_rate": 0.0002,
+      "loss": 1.2201,
+      "step": 800
+    },
+    {
+      "epoch": 0.9457092819614711,
+      "grad_norm": 0.39119839668273926,
+      "learning_rate": 0.0002,
+      "loss": 1.0977,
+      "step": 810
+    },
+    {
+      "epoch": 0.9573847051955633,
+      "grad_norm": 0.1995590776205063,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 820
+    },
+    {
+      "epoch": 0.9690601284296556,
+      "grad_norm": 0.1983078271150589,
+      "learning_rate": 0.0002,
+      "loss": 0.9257,
+      "step": 830
+    },
+    {
+      "epoch": 0.9807355516637478,
+      "grad_norm": 0.19562935829162598,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 840
+    },
+    {
+      "epoch": 0.99241097489784,
+      "grad_norm": 0.21720626950263977,
+      "learning_rate": 0.0002,
+      "loss": 1.2414,
+      "step": 850
+    },
+    {
+      "epoch": 0.9994162288382954,
+      "eval_loss": 1.1984628438949585,
+      "eval_runtime": 52.824,
+      "eval_samples_per_second": 8.67,
+      "eval_steps_per_second": 1.098,
+      "step": 856
+    },
+    {
+      "epoch": 1.0040863981319323,
+      "grad_norm": 0.20022626221179962,
+      "learning_rate": 0.0002,
+      "loss": 1.1012,
+      "step": 860
+    },
+    {
+      "epoch": 1.0157618213660244,
+      "grad_norm": 0.18347179889678955,
+      "learning_rate": 0.0002,
+      "loss": 1.1477,
+      "step": 870
+    },
+    {
+      "epoch": 1.0274372446001168,
+      "grad_norm": 0.27677398920059204,
+      "learning_rate": 0.0002,
+      "loss": 1.0021,
+      "step": 880
+    },
+    {
+      "epoch": 1.039112667834209,
+      "grad_norm": 0.1613788902759552,
+      "learning_rate": 0.0002,
+      "loss": 0.9135,
+      "step": 890
+    },
+    {
+      "epoch": 1.0507880910683012,
+      "grad_norm": 0.34981176257133484,
+      "learning_rate": 0.0002,
+      "loss": 0.9362,
+      "step": 900
+    },
+    {
+      "epoch": 1.0624635143023935,
+      "grad_norm": 0.2047315239906311,
+      "learning_rate": 0.0002,
+      "loss": 1.0158,
+      "step": 910
+    },
+    {
+      "epoch": 1.0741389375364856,
+      "grad_norm": 0.2312125563621521,
+      "learning_rate": 0.0002,
+      "loss": 1.0819,
+      "step": 920
+    },
+    {
+      "epoch": 1.085814360770578,
+      "grad_norm": 0.1890091598033905,
+      "learning_rate": 0.0002,
+      "loss": 0.8474,
+      "step": 930
+    },
+    {
+      "epoch": 1.0974897840046702,
+      "grad_norm": 0.2594001889228821,
+      "learning_rate": 0.0002,
+      "loss": 0.9807,
+      "step": 940
+    },
+    {
+      "epoch": 1.1091652072387623,
+      "grad_norm": 0.23180805146694183,
+      "learning_rate": 0.0002,
+      "loss": 0.9598,
+      "step": 950
+    },
+    {
+      "epoch": 1.1208406304728546,
+      "grad_norm": 0.3079565465450287,
+      "learning_rate": 0.0002,
+      "loss": 0.9935,
+      "step": 960
+    },
+    {
+      "epoch": 1.132516053706947,
+      "grad_norm": 0.348038911819458,
+      "learning_rate": 0.0002,
+      "loss": 1.1019,
+      "step": 970
+    },
+    {
+      "epoch": 1.144191476941039,
+      "grad_norm": 0.25485727190971375,
+      "learning_rate": 0.0002,
+      "loss": 1.0712,
+      "step": 980
+    },
+    {
+      "epoch": 1.1558669001751314,
+      "grad_norm": 0.3280978202819824,
+      "learning_rate": 0.0002,
+      "loss": 0.9455,
+      "step": 990
+    },
+    {
+      "epoch": 1.1675423234092235,
+      "grad_norm": 0.3325645327568054,
+      "learning_rate": 0.0002,
+      "loss": 0.8836,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1792177466433158,
+      "grad_norm": 0.25743699073791504,
+      "learning_rate": 0.0002,
+      "loss": 0.8908,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1908931698774081,
+      "grad_norm": 0.23885756731033325,
+      "learning_rate": 0.0002,
+      "loss": 0.9363,
+      "step": 1020
+    },
+    {
+      "epoch": 1.2025685931115002,
+      "grad_norm": 0.2594054043292999,
+      "learning_rate": 0.0002,
+      "loss": 1.0811,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2142440163455925,
+      "grad_norm": 0.2806910276412964,
+      "learning_rate": 0.0002,
+      "loss": 0.8865,
+      "step": 1040
+    },
+    {
+      "epoch": 1.2259194395796849,
+      "grad_norm": 0.2919756770133972,
+      "learning_rate": 0.0002,
+      "loss": 0.9874,
+      "step": 1050
+    },
+    {
+      "epoch": 1.237594862813777,
+      "grad_norm": 0.2846801280975342,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 1060
+    },
+    {
+      "epoch": 1.2492702860478693,
+      "grad_norm": 0.22056721150875092,
+      "learning_rate": 0.0002,
+      "loss": 1.0518,
+      "step": 1070
+    },
+    {
+      "epoch": 1.2609457092819616,
+      "grad_norm": 0.21786770224571228,
+      "learning_rate": 0.0002,
+      "loss": 1.0387,
+      "step": 1080
+    },
+    {
+      "epoch": 1.2726211325160537,
+      "grad_norm": 0.21728235483169556,
+      "learning_rate": 0.0002,
+      "loss": 1.1127,
+      "step": 1090
+    },
+    {
+      "epoch": 1.284296555750146,
+      "grad_norm": 0.38934388756752014,
+      "learning_rate": 0.0002,
+      "loss": 0.9172,
+      "step": 1100
+    },
+    {
+      "epoch": 1.295971978984238,
+      "grad_norm": 0.4942418336868286,
+      "learning_rate": 0.0002,
+      "loss": 1.0471,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3076474022183304,
+      "grad_norm": 0.22632132470607758,
+      "learning_rate": 0.0002,
+      "loss": 0.979,
+      "step": 1120
+    },
+    {
+      "epoch": 1.3193228254524225,
+      "grad_norm": 0.2033795416355133,
+      "learning_rate": 0.0002,
+      "loss": 0.9236,
+      "step": 1130
+    },
+    {
+      "epoch": 1.3309982486865148,
+      "grad_norm": 0.3090406656265259,
+      "learning_rate": 0.0002,
+      "loss": 1.022,
+      "step": 1140
+    },
+    {
+      "epoch": 1.3426736719206072,
+      "grad_norm": 0.3908712565898895,
+      "learning_rate": 0.0002,
+      "loss": 0.9919,
+      "step": 1150
+    },
+    {
+      "epoch": 1.3543490951546993,
+      "grad_norm": 0.5885194540023804,
+      "learning_rate": 0.0002,
+      "loss": 0.9933,
+      "step": 1160
+    },
+    {
+      "epoch": 1.3660245183887916,
+      "grad_norm": 0.28344446420669556,
+      "learning_rate": 0.0002,
+      "loss": 0.9689,
+      "step": 1170
+    },
+    {
+      "epoch": 1.377699941622884,
+      "grad_norm": 0.305290162563324,
+      "learning_rate": 0.0002,
+      "loss": 0.8012,
+      "step": 1180
+    },
+    {
+      "epoch": 1.389375364856976,
+      "grad_norm": 0.47231870889663696,
+      "learning_rate": 0.0002,
+      "loss": 1.0218,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4010507880910683,
+      "grad_norm": 0.1865382194519043,
+      "learning_rate": 0.0002,
+      "loss": 0.7837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4127262113251606,
+      "grad_norm": 0.19491282105445862,
+      "learning_rate": 0.0002,
+      "loss": 0.8884,
+      "step": 1210
+    },
+    {
+      "epoch": 1.4244016345592527,
+      "grad_norm": 0.26192259788513184,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1220
+    },
+    {
+      "epoch": 1.436077057793345,
+      "grad_norm": 0.25829964876174927,
+      "learning_rate": 0.0002,
+      "loss": 0.959,
+      "step": 1230
+    },
+    {
+      "epoch": 1.4477524810274374,
+      "grad_norm": 0.24313300848007202,
+      "learning_rate": 0.0002,
+      "loss": 0.9476,
+      "step": 1240
+    },
+    {
+      "epoch": 1.4594279042615295,
+      "grad_norm": 0.18807671964168549,
+      "learning_rate": 0.0002,
+      "loss": 0.9669,
+      "step": 1250
+    },
+    {
+      "epoch": 1.4711033274956218,
+      "grad_norm": 0.24352246522903442,
+      "learning_rate": 0.0002,
+      "loss": 0.9117,
+      "step": 1260
+    },
+    {
+      "epoch": 1.4827787507297139,
+      "grad_norm": 0.401624470949173,
+      "learning_rate": 0.0002,
+      "loss": 0.9541,
+      "step": 1270
+    },
+    {
+      "epoch": 1.4944541739638062,
+      "grad_norm": 0.3230941891670227,
+      "learning_rate": 0.0002,
+      "loss": 0.8756,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5061295971978983,
+      "grad_norm": 0.22052809596061707,
+      "learning_rate": 0.0002,
+      "loss": 0.8811,
+      "step": 1290
+    },
+    {
+      "epoch": 1.5178050204319906,
+      "grad_norm": 0.8212894201278687,
+      "learning_rate": 0.0002,
+      "loss": 1.0614,
+      "step": 1300
+    },
+    {
+      "epoch": 1.529480443666083,
+      "grad_norm": 0.2073482722043991,
+      "learning_rate": 0.0002,
+      "loss": 0.9205,
+      "step": 1310
+    },
+    {
+      "epoch": 1.541155866900175,
+      "grad_norm": 0.2663249373435974,
+      "learning_rate": 0.0002,
+      "loss": 0.9092,
+      "step": 1320
+    },
+    {
+      "epoch": 1.5528312901342674,
+      "grad_norm": 0.20269067585468292,
+      "learning_rate": 0.0002,
+      "loss": 0.9093,
+      "step": 1330
+    },
+    {
+      "epoch": 1.5645067133683597,
+      "grad_norm": 0.23635980486869812,
+      "learning_rate": 0.0002,
+      "loss": 0.9924,
+      "step": 1340
+    },
+    {
+      "epoch": 1.5761821366024518,
+      "grad_norm": 0.2865060865879059,
+      "learning_rate": 0.0002,
+      "loss": 0.9669,
+      "step": 1350
+    },
+    {
+      "epoch": 1.587857559836544,
+      "grad_norm": 0.19927282631397247,
+      "learning_rate": 0.0002,
+      "loss": 0.982,
+      "step": 1360
+    },
+    {
+      "epoch": 1.5995329830706364,
+      "grad_norm": 0.2837635278701782,
+      "learning_rate": 0.0002,
+      "loss": 0.9702,
+      "step": 1370
+    },
+    {
+      "epoch": 1.6112084063047285,
+      "grad_norm": 0.20541565120220184,
+      "learning_rate": 0.0002,
+      "loss": 0.8996,
+      "step": 1380
+    },
+    {
+      "epoch": 1.6228838295388208,
+      "grad_norm": 0.29976120591163635,
+      "learning_rate": 0.0002,
+      "loss": 0.9704,
+      "step": 1390
+    },
+    {
+      "epoch": 1.6345592527729131,
+      "grad_norm": 0.4811157286167145,
+      "learning_rate": 0.0002,
+      "loss": 1.0675,
+      "step": 1400
+    },
+    {
+      "epoch": 1.6462346760070052,
+      "grad_norm": 0.45696231722831726,
+      "learning_rate": 0.0002,
+      "loss": 0.9802,
+      "step": 1410
+    },
+    {
+      "epoch": 1.6579100992410973,
+      "grad_norm": 0.26527705788612366,
+      "learning_rate": 0.0002,
+      "loss": 0.993,
+      "step": 1420
+    },
+    {
+      "epoch": 1.6695855224751899,
+      "grad_norm": 0.2607211172580719,
+      "learning_rate": 0.0002,
+      "loss": 0.8715,
+      "step": 1430
+    },
+    {
+      "epoch": 1.681260945709282,
+      "grad_norm": 0.30872678756713867,
+      "learning_rate": 0.0002,
+      "loss": 0.9527,
+      "step": 1440
+    },
+    {
+      "epoch": 1.692936368943374,
+      "grad_norm": 0.378160297870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1103,
+      "step": 1450
+    },
+    {
+      "epoch": 1.7046117921774664,
+      "grad_norm": 0.4253346025943756,
+      "learning_rate": 0.0002,
+      "loss": 0.9507,
+      "step": 1460
+    },
+    {
+      "epoch": 1.7162872154115587,
+      "grad_norm": 0.23859360814094543,
+      "learning_rate": 0.0002,
+      "loss": 1.0566,
+      "step": 1470
+    },
+    {
+      "epoch": 1.7279626386456508,
+      "grad_norm": 0.4765002727508545,
+      "learning_rate": 0.0002,
+      "loss": 0.93,
+      "step": 1480
+    },
+    {
+      "epoch": 1.7396380618797431,
+      "grad_norm": 0.1958352029323578,
+      "learning_rate": 0.0002,
+      "loss": 0.9884,
+      "step": 1490
+    },
+    {
+      "epoch": 1.7513134851138354,
+      "grad_norm": 0.1772938221693039,
+      "learning_rate": 0.0002,
+      "loss": 1.0072,
+      "step": 1500
+    },
+    {
+      "epoch": 1.7629889083479275,
+      "grad_norm": 0.2589171826839447,
+      "learning_rate": 0.0002,
+      "loss": 0.9798,
+      "step": 1510
+    },
+    {
+      "epoch": 1.7746643315820199,
+      "grad_norm": 0.638349175453186,
+      "learning_rate": 0.0002,
+      "loss": 0.9079,
+      "step": 1520
+    },
+    {
+      "epoch": 1.7863397548161122,
+      "grad_norm": 0.2402309626340866,
+      "learning_rate": 0.0002,
+      "loss": 0.8141,
+      "step": 1530
+    },
+    {
+      "epoch": 1.7980151780502043,
+      "grad_norm": 0.3758494257926941,
+      "learning_rate": 0.0002,
+      "loss": 0.9535,
+      "step": 1540
+    },
+    {
+      "epoch": 1.8096906012842966,
+      "grad_norm": 0.26750659942626953,
+      "learning_rate": 0.0002,
+      "loss": 0.9862,
+      "step": 1550
+    },
+    {
+      "epoch": 1.821366024518389,
+      "grad_norm": 0.3884737193584442,
+      "learning_rate": 0.0002,
+      "loss": 0.9289,
+      "step": 1560
+    },
+    {
+      "epoch": 1.833041447752481,
+      "grad_norm": 0.2704276740550995,
+      "learning_rate": 0.0002,
+      "loss": 0.9064,
+      "step": 1570
+    },
+    {
+      "epoch": 1.844716870986573,
+      "grad_norm": 0.2269623726606369,
+      "learning_rate": 0.0002,
+      "loss": 1.0027,
+      "step": 1580
+    },
+    {
+      "epoch": 1.8563922942206657,
+      "grad_norm": 0.23369084298610687,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1590
+    },
+    {
+      "epoch": 1.8680677174547577,
+      "grad_norm": 0.34336966276168823,
+      "learning_rate": 0.0002,
+      "loss": 0.8899,
+      "step": 1600
+    },
+    {
+      "epoch": 1.8797431406888498,
+      "grad_norm": 0.638863205909729,
+      "learning_rate": 0.0002,
+      "loss": 0.9465,
+      "step": 1610
+    },
+    {
+      "epoch": 1.8914185639229422,
+      "grad_norm": 0.4810437262058258,
+      "learning_rate": 0.0002,
+      "loss": 1.0567,
+      "step": 1620
+    },
+    {
+      "epoch": 1.9030939871570345,
+      "grad_norm": 0.27600526809692383,
+      "learning_rate": 0.0002,
+      "loss": 0.8272,
+      "step": 1630
+    },
+    {
+      "epoch": 1.9147694103911266,
+      "grad_norm": 0.44480231404304504,
+      "learning_rate": 0.0002,
+      "loss": 0.8816,
+      "step": 1640
+    },
+    {
+      "epoch": 1.926444833625219,
+      "grad_norm": 0.29854336380958557,
+      "learning_rate": 0.0002,
+      "loss": 0.8868,
+      "step": 1650
+    },
+    {
+      "epoch": 1.9381202568593112,
+      "grad_norm": 0.21352696418762207,
+      "learning_rate": 0.0002,
+      "loss": 0.9721,
+      "step": 1660
+    },
+    {
+      "epoch": 1.9497956800934033,
+      "grad_norm": 0.26450464129447937,
+      "learning_rate": 0.0002,
+      "loss": 0.9225,
+      "step": 1670
+    },
+    {
+      "epoch": 1.9614711033274956,
+      "grad_norm": 0.23895719647407532,
+      "learning_rate": 0.0002,
+      "loss": 1.0356,
+      "step": 1680
+    },
+    {
+      "epoch": 1.973146526561588,
+      "grad_norm": 0.23323677480220795,
+      "learning_rate": 0.0002,
+      "loss": 0.9148,
+      "step": 1690
+    },
+    {
+      "epoch": 1.98482194979568,
+      "grad_norm": 0.46997103095054626,
+      "learning_rate": 0.0002,
+      "loss": 0.9857,
+      "step": 1700
+    },
+    {
+      "epoch": 1.9964973730297724,
+      "grad_norm": 0.34337419271469116,
+      "learning_rate": 0.0002,
+      "loss": 1.1167,
+      "step": 1710
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.1868568658828735,
+      "eval_runtime": 53.25,
+      "eval_samples_per_second": 8.601,
+      "eval_steps_per_second": 1.089,
+      "step": 1713
+    },
+    {
+      "epoch": 2.0081727962638647,
+      "grad_norm": 0.19537708163261414,
+      "learning_rate": 0.0002,
+      "loss": 1.0032,
+      "step": 1720
+    },
+    {
+      "epoch": 2.019848219497957,
+      "grad_norm": 0.23779849708080292,
+      "learning_rate": 0.0002,
+      "loss": 0.7753,
+      "step": 1730
+    },
+    {
+      "epoch": 2.031523642732049,
+      "grad_norm": 0.5516199469566345,
+      "learning_rate": 0.0002,
+      "loss": 0.8975,
+      "step": 1740
+    },
+    {
+      "epoch": 2.0431990659661414,
+      "grad_norm": 0.36250197887420654,
+      "learning_rate": 0.0002,
+      "loss": 0.8137,
+      "step": 1750
+    },
+    {
+      "epoch": 2.0548744892002335,
+      "grad_norm": 0.4038652777671814,
+      "learning_rate": 0.0002,
+      "loss": 0.7819,
+      "step": 1760
+    },
+    {
+      "epoch": 2.0665499124343256,
+      "grad_norm": 0.36477968096733093,
+      "learning_rate": 0.0002,
+      "loss": 0.8192,
+      "step": 1770
+    },
+    {
+      "epoch": 2.078225335668418,
+      "grad_norm": 0.48163020610809326,
+      "learning_rate": 0.0002,
+      "loss": 0.9101,
+      "step": 1780
+    },
+    {
+      "epoch": 2.0899007589025103,
+      "grad_norm": 0.41786351799964905,
+      "learning_rate": 0.0002,
+      "loss": 0.8166,
+      "step": 1790
+    },
+    {
+      "epoch": 2.1015761821366024,
+      "grad_norm": 0.24622796475887299,
+      "learning_rate": 0.0002,
+      "loss": 0.8776,
+      "step": 1800
+    },
+    {
+      "epoch": 2.113251605370695,
+      "grad_norm": 0.2948087155818939,
+      "learning_rate": 0.0002,
+      "loss": 0.838,
+      "step": 1810
+    },
+    {
+      "epoch": 2.124927028604787,
+      "grad_norm": 0.29395580291748047,
+      "learning_rate": 0.0002,
+      "loss": 0.729,
+      "step": 1820
+    },
+    {
+      "epoch": 2.136602451838879,
+      "grad_norm": 0.4753067195415497,
+      "learning_rate": 0.0002,
+      "loss": 0.7967,
+      "step": 1830
+    },
+    {
+      "epoch": 2.148277875072971,
+      "grad_norm": 0.5675700902938843,
+      "learning_rate": 0.0002,
+      "loss": 0.892,
+      "step": 1840
+    },
+    {
+      "epoch": 2.1599532983070637,
+      "grad_norm": 0.6422085762023926,
+      "learning_rate": 0.0002,
+      "loss": 0.8901,
+      "step": 1850
+    },
+    {
+      "epoch": 2.171628721541156,
+      "grad_norm": 0.677617609500885,
+      "learning_rate": 0.0002,
+      "loss": 0.8384,
+      "step": 1860
+    },
+    {
+      "epoch": 2.183304144775248,
+      "grad_norm": 0.501675009727478,
+      "learning_rate": 0.0002,
+      "loss": 0.9367,
+      "step": 1870
+    },
+    {
+      "epoch": 2.1949795680093405,
+      "grad_norm": 0.2996771037578583,
+      "learning_rate": 0.0002,
+      "loss": 0.8267,
+      "step": 1880
+    },
+    {
+      "epoch": 2.2066549912434326,
+      "grad_norm": 0.285370796918869,
+      "learning_rate": 0.0002,
+      "loss": 0.8226,
+      "step": 1890
+    },
+    {
+      "epoch": 2.2183304144775247,
+      "grad_norm": 0.25751280784606934,
+      "learning_rate": 0.0002,
+      "loss": 0.8776,
+      "step": 1900
+    },
+    {
+      "epoch": 2.230005837711617,
+      "grad_norm": 0.5294895768165588,
+      "learning_rate": 0.0002,
+      "loss": 0.7739,
+      "step": 1910
+    },
+    {
+      "epoch": 2.2416812609457093,
+      "grad_norm": 0.5125291347503662,
+      "learning_rate": 0.0002,
+      "loss": 0.737,
+      "step": 1920
+    },
+    {
+      "epoch": 2.2533566841798014,
+      "grad_norm": 0.40055087208747864,
+      "learning_rate": 0.0002,
+      "loss": 0.7923,
+      "step": 1930
+    },
+    {
+      "epoch": 2.265032107413894,
+      "grad_norm": 0.32131722569465637,
+      "learning_rate": 0.0002,
+      "loss": 0.7963,
+      "step": 1940
+    },
+    {
+      "epoch": 2.276707530647986,
+      "grad_norm": 0.40105271339416504,
+      "learning_rate": 0.0002,
+      "loss": 0.918,
+      "step": 1950
+    },
+    {
+      "epoch": 2.288382953882078,
+      "grad_norm": 0.274095356464386,
+      "learning_rate": 0.0002,
+      "loss": 0.7701,
+      "step": 1960
+    },
+    {
+      "epoch": 2.3000583771161702,
+      "grad_norm": 0.6427090764045715,
+      "learning_rate": 0.0002,
+      "loss": 0.8222,
+      "step": 1970
+    },
+    {
+      "epoch": 2.3117338003502628,
+      "grad_norm": 0.32184919714927673,
+      "learning_rate": 0.0002,
+      "loss": 0.8151,
+      "step": 1980
+    },
+    {
+      "epoch": 2.323409223584355,
+      "grad_norm": 0.28641724586486816,
+      "learning_rate": 0.0002,
+      "loss": 0.791,
+      "step": 1990
+    },
+    {
+      "epoch": 2.335084646818447,
+      "grad_norm": 0.8957763314247131,
+      "learning_rate": 0.0002,
+      "loss": 0.8111,
+      "step": 2000
+    },
+    {
+      "epoch": 2.3467600700525395,
+      "grad_norm": 0.43205350637435913,
+      "learning_rate": 0.0002,
+      "loss": 0.7833,
+      "step": 2010
+    },
+    {
+      "epoch": 2.3584354932866316,
+      "grad_norm": 0.2754843831062317,
+      "learning_rate": 0.0002,
+      "loss": 0.8703,
+      "step": 2020
+    },
+    {
+      "epoch": 2.3701109165207237,
+      "grad_norm": 0.3866446316242218,
+      "learning_rate": 0.0002,
+      "loss": 0.825,
+      "step": 2030
+    },
+    {
+      "epoch": 2.3817863397548162,
+      "grad_norm": 0.596156656742096,
+      "learning_rate": 0.0002,
+      "loss": 0.9382,
+      "step": 2040
+    },
+    {
+      "epoch": 2.3934617629889083,
+      "grad_norm": 0.5955569744110107,
+      "learning_rate": 0.0002,
+      "loss": 0.8716,
+      "step": 2050
+    },
+    {
+      "epoch": 2.4051371862230004,
+      "grad_norm": 0.49891725182533264,
+      "learning_rate": 0.0002,
+      "loss": 0.77,
+      "step": 2060
+    },
+    {
+      "epoch": 2.416812609457093,
+      "grad_norm": 0.336823433637619,
+      "learning_rate": 0.0002,
+      "loss": 0.8657,
+      "step": 2070
+    },
+    {
+      "epoch": 2.428488032691185,
+      "grad_norm": 0.31427133083343506,
+      "learning_rate": 0.0002,
+      "loss": 0.8321,
+      "step": 2080
+    },
+    {
+      "epoch": 2.440163455925277,
+      "grad_norm": 0.6004841923713684,
+      "learning_rate": 0.0002,
+      "loss": 0.8567,
+      "step": 2090
+    },
+    {
+      "epoch": 2.4518388791593697,
+      "grad_norm": 0.6182882189750671,
+      "learning_rate": 0.0002,
+      "loss": 0.8419,
+      "step": 2100
+    },
+    {
+      "epoch": 2.463514302393462,
+      "grad_norm": 0.4464357793331146,
+      "learning_rate": 0.0002,
+      "loss": 0.7655,
+      "step": 2110
+    },
+    {
+      "epoch": 2.475189725627554,
+      "grad_norm": 0.26698681712150574,
+      "learning_rate": 0.0002,
+      "loss": 0.7783,
+      "step": 2120
+    },
+    {
+      "epoch": 2.4868651488616464,
+      "grad_norm": 0.32835668325424194,
+      "learning_rate": 0.0002,
+      "loss": 0.8636,
+      "step": 2130
+    },
+    {
+      "epoch": 2.4985405720957385,
+      "grad_norm": 0.30060240626335144,
+      "learning_rate": 0.0002,
+      "loss": 0.7974,
+      "step": 2140
+    },
+    {
+      "epoch": 2.5102159953298306,
+      "grad_norm": 0.5971834659576416,
+      "learning_rate": 0.0002,
+      "loss": 0.8365,
+      "step": 2150
+    },
+    {
+      "epoch": 2.521891418563923,
+      "grad_norm": 0.29032406210899353,
+      "learning_rate": 0.0002,
+      "loss": 0.8427,
+      "step": 2160
+    },
+    {
+      "epoch": 2.5335668417980153,
+      "grad_norm": 0.3044188916683197,
+      "learning_rate": 0.0002,
+      "loss": 0.8321,
+      "step": 2170
+    },
+    {
+      "epoch": 2.5452422650321074,
+      "grad_norm": 0.5061913728713989,
+      "learning_rate": 0.0002,
+      "loss": 0.9266,
+      "step": 2180
+    },
+    {
+      "epoch": 2.5569176882662,
+      "grad_norm": 0.3165229856967926,
+      "learning_rate": 0.0002,
+      "loss": 0.9161,
+      "step": 2190
+    },
+    {
+      "epoch": 2.568593111500292,
+      "grad_norm": 0.5463014841079712,
+      "learning_rate": 0.0002,
+      "loss": 0.9278,
+      "step": 2200
+    },
+    {
+      "epoch": 2.580268534734384,
+      "grad_norm": 0.28532662987709045,
+      "learning_rate": 0.0002,
+      "loss": 0.7847,
+      "step": 2210
+    },
+    {
+      "epoch": 2.591943957968476,
+      "grad_norm": 0.2705112397670746,
+      "learning_rate": 0.0002,
+      "loss": 0.8256,
+      "step": 2220
+    },
+    {
+      "epoch": 2.6036193812025687,
+      "grad_norm": 0.7311036586761475,
+      "learning_rate": 0.0002,
+      "loss": 0.8753,
+      "step": 2230
+    },
+    {
+      "epoch": 2.615294804436661,
+      "grad_norm": 0.31091684103012085,
+      "learning_rate": 0.0002,
+      "loss": 0.8641,
+      "step": 2240
+    },
+    {
+      "epoch": 2.626970227670753,
+      "grad_norm": 0.3427600860595703,
+      "learning_rate": 0.0002,
+      "loss": 0.7109,
+      "step": 2250
+    },
+    {
+      "epoch": 2.638645650904845,
+      "grad_norm": 0.426582932472229,
+      "learning_rate": 0.0002,
+      "loss": 0.8552,
+      "step": 2260
+    },
+    {
+      "epoch": 2.6503210741389376,
+      "grad_norm": 0.6608081459999084,
+      "learning_rate": 0.0002,
+      "loss": 0.8991,
+      "step": 2270
+    },
+    {
+      "epoch": 2.6619964973730297,
+      "grad_norm": 0.8316800594329834,
+      "learning_rate": 0.0002,
+      "loss": 0.8202,
+      "step": 2280
+    },
+    {
+      "epoch": 2.6736719206071218,
+      "grad_norm": 0.3304220139980316,
+      "learning_rate": 0.0002,
+      "loss": 0.7936,
+      "step": 2290
+    },
+    {
+      "epoch": 2.6853473438412143,
+      "grad_norm": 0.3448123335838318,
+      "learning_rate": 0.0002,
+      "loss": 0.9264,
+      "step": 2300
+    },
+    {
+      "epoch": 2.6970227670753064,
+      "grad_norm": 0.35891813039779663,
+      "learning_rate": 0.0002,
+      "loss": 0.7975,
+      "step": 2310
+    },
+    {
+      "epoch": 2.7086981903093985,
+      "grad_norm": 0.4558456540107727,
+      "learning_rate": 0.0002,
+      "loss": 0.8882,
+      "step": 2320
+    },
+    {
+      "epoch": 2.720373613543491,
+      "grad_norm": 0.2969972491264343,
+      "learning_rate": 0.0002,
+      "loss": 0.8277,
+      "step": 2330
+    },
+    {
+      "epoch": 2.732049036777583,
+      "grad_norm": 0.5421506762504578,
+      "learning_rate": 0.0002,
+      "loss": 0.8228,
+      "step": 2340
+    },
+    {
+      "epoch": 2.7437244600116752,
+      "grad_norm": 0.6532469987869263,
+      "learning_rate": 0.0002,
+      "loss": 0.8908,
+      "step": 2350
+    },
+    {
+      "epoch": 2.755399883245768,
+      "grad_norm": 0.30502063035964966,
+      "learning_rate": 0.0002,
+      "loss": 0.8642,
+      "step": 2360
+    },
+    {
+      "epoch": 2.76707530647986,
+      "grad_norm": 0.28669285774230957,
+      "learning_rate": 0.0002,
+      "loss": 0.8399,
+      "step": 2370
+    },
+    {
+      "epoch": 2.778750729713952,
+      "grad_norm": 0.38026052713394165,
+      "learning_rate": 0.0002,
+      "loss": 0.7736,
+      "step": 2380
+    },
+    {
+      "epoch": 2.7904261529480445,
+      "grad_norm": 0.5903686285018921,
+      "learning_rate": 0.0002,
+      "loss": 0.8979,
+      "step": 2390
+    },
+    {
+      "epoch": 2.8021015761821366,
+      "grad_norm": 0.49472540616989136,
+      "learning_rate": 0.0002,
+      "loss": 0.8518,
+      "step": 2400
+    },
+    {
+      "epoch": 2.8137769994162287,
+      "grad_norm": 0.4611932933330536,
+      "learning_rate": 0.0002,
+      "loss": 0.7729,
+      "step": 2410
+    },
+    {
+      "epoch": 2.8254524226503213,
+      "grad_norm": 0.4907233715057373,
+      "learning_rate": 0.0002,
+      "loss": 0.7701,
+      "step": 2420
+    },
+    {
+      "epoch": 2.8371278458844134,
+      "grad_norm": 0.2857356667518616,
+      "learning_rate": 0.0002,
+      "loss": 0.7011,
+      "step": 2430
+    },
+    {
+      "epoch": 2.8488032691185055,
+      "grad_norm": 0.2729904353618622,
+      "learning_rate": 0.0002,
+      "loss": 0.8805,
+      "step": 2440
+    },
+    {
+      "epoch": 2.860478692352598,
+      "grad_norm": 0.4903719425201416,
+      "learning_rate": 0.0002,
+      "loss": 0.875,
+      "step": 2450
+    },
+    {
+      "epoch": 2.87215411558669,
+      "grad_norm": 0.3039948344230652,
+      "learning_rate": 0.0002,
+      "loss": 0.8397,
+      "step": 2460
+    },
+    {
+      "epoch": 2.883829538820782,
+      "grad_norm": 0.5554929971694946,
+      "learning_rate": 0.0002,
+      "loss": 0.8152,
+      "step": 2470
+    },
+    {
+      "epoch": 2.8955049620548747,
+      "grad_norm": 0.5474334359169006,
+      "learning_rate": 0.0002,
+      "loss": 0.9162,
+      "step": 2480
+    },
+    {
+      "epoch": 2.907180385288967,
+      "grad_norm": 0.44103026390075684,
+      "learning_rate": 0.0002,
+      "loss": 0.9273,
+      "step": 2490
+    },
+    {
+      "epoch": 2.918855808523059,
+      "grad_norm": 0.2763408422470093,
+      "learning_rate": 0.0002,
+      "loss": 0.7229,
+      "step": 2500
+    },
+    {
+      "epoch": 2.9305312317571515,
+      "grad_norm": 0.7191962599754333,
+      "learning_rate": 0.0002,
+      "loss": 0.8002,
+      "step": 2510
+    },
+    {
+      "epoch": 2.9422066549912436,
+      "grad_norm": 0.5265306234359741,
+      "learning_rate": 0.0002,
+      "loss": 0.849,
+      "step": 2520
+    },
+    {
+      "epoch": 2.9538820782253357,
+      "grad_norm": 0.4153187870979309,
+      "learning_rate": 0.0002,
+      "loss": 0.8953,
+      "step": 2530
+    },
+    {
+      "epoch": 2.9655575014594278,
+      "grad_norm": 0.5972274541854858,
+      "learning_rate": 0.0002,
+      "loss": 0.9053,
+      "step": 2540
+    },
+    {
+      "epoch": 2.9772329246935203,
+      "grad_norm": 0.47656795382499695,
+      "learning_rate": 0.0002,
+      "loss": 0.8102,
+      "step": 2550
+    },
+    {
+      "epoch": 2.9889083479276124,
+      "grad_norm": 0.5227161049842834,
+      "learning_rate": 0.0002,
+      "loss": 0.9021,
+      "step": 2560
+    },
+    {
+      "epoch": 2.9994162288382955,
+      "eval_loss": 1.19550621509552,
+      "eval_runtime": 40.3059,
+      "eval_samples_per_second": 11.363,
+      "eval_steps_per_second": 1.439,
+      "step": 2569
+    },
+    {
+      "epoch": 3.0005837711617045,
+      "grad_norm": 1.285637617111206,
+      "learning_rate": 0.0002,
+      "loss": 0.8003,
+      "step": 2570
+    },
+    {
+      "epoch": 3.012259194395797,
+      "grad_norm": 0.3119623363018036,
+      "learning_rate": 0.0002,
+      "loss": 0.649,
+      "step": 2580
+    },
+    {
+      "epoch": 3.023934617629889,
+      "grad_norm": 1.0560064315795898,
+      "learning_rate": 0.0002,
+      "loss": 0.7191,
+      "step": 2590
+    },
+    {
+      "epoch": 3.0356100408639812,
+      "grad_norm": 0.5016117095947266,
+      "learning_rate": 0.0002,
+      "loss": 0.8072,
+      "step": 2600
+    },
+    {
+      "epoch": 3.0472854640980738,
+      "grad_norm": 0.34641745686531067,
+      "learning_rate": 0.0002,
+      "loss": 0.716,
+      "step": 2610
+    },
+    {
+      "epoch": 3.058960887332166,
+      "grad_norm": 0.45696789026260376,
+      "learning_rate": 0.0002,
+      "loss": 0.666,
+      "step": 2620
+    },
+    {
+      "epoch": 3.070636310566258,
+      "grad_norm": 0.5084333419799805,
+      "learning_rate": 0.0002,
+      "loss": 0.697,
+      "step": 2630
+    },
+    {
+      "epoch": 3.08231173380035,
+      "grad_norm": 0.34665152430534363,
+      "learning_rate": 0.0002,
+      "loss": 0.6796,
+      "step": 2640
+    },
+    {
+      "epoch": 3.0939871570344426,
+      "grad_norm": 0.4613274931907654,
+      "learning_rate": 0.0002,
+      "loss": 0.6629,
+      "step": 2650
+    },
+    {
+      "epoch": 3.1056625802685347,
+      "grad_norm": 0.3228662610054016,
+      "learning_rate": 0.0002,
+      "loss": 0.6836,
+      "step": 2660
+    },
+    {
+      "epoch": 3.117338003502627,
+      "grad_norm": 0.4133684039115906,
+      "learning_rate": 0.0002,
+      "loss": 0.6427,
+      "step": 2670
+    },
+    {
+      "epoch": 3.1290134267367193,
+      "grad_norm": 0.8400356769561768,
+      "learning_rate": 0.0002,
+      "loss": 0.7553,
+      "step": 2680
+    },
+    {
+      "epoch": 3.1406888499708114,
+      "grad_norm": 0.526075005531311,
+      "learning_rate": 0.0002,
+      "loss": 0.7498,
+      "step": 2690
+    },
+    {
+      "epoch": 3.1523642732049035,
+      "grad_norm": 0.3885576128959656,
+      "learning_rate": 0.0002,
+      "loss": 0.6413,
+      "step": 2700
+    },
+    {
+      "epoch": 3.164039696438996,
+      "grad_norm": 0.4284312129020691,
+      "learning_rate": 0.0002,
+      "loss": 0.6407,
+      "step": 2710
+    },
+    {
+      "epoch": 3.175715119673088,
+      "grad_norm": 0.39915287494659424,
+      "learning_rate": 0.0002,
+      "loss": 0.7752,
+      "step": 2720
+    },
+    {
+      "epoch": 3.1873905429071803,
+      "grad_norm": 0.6336325407028198,
+      "learning_rate": 0.0002,
+      "loss": 0.7484,
+      "step": 2730
+    },
+    {
+      "epoch": 3.199065966141273,
+      "grad_norm": 0.7586342096328735,
+      "learning_rate": 0.0002,
+      "loss": 0.6534,
+      "step": 2740
+    },
+    {
+      "epoch": 3.210741389375365,
+      "grad_norm": 0.6146870255470276,
+      "learning_rate": 0.0002,
+      "loss": 0.6995,
+      "step": 2750
+    },
+    {
+      "epoch": 3.222416812609457,
+      "grad_norm": 0.8655000925064087,
+      "learning_rate": 0.0002,
+      "loss": 0.7787,
+      "step": 2760
+    },
+    {
+      "epoch": 3.2340922358435495,
+      "grad_norm": 0.46057945489883423,
+      "learning_rate": 0.0002,
+      "loss": 0.833,
+      "step": 2770
+    },
+    {
+      "epoch": 3.2457676590776416,
+      "grad_norm": 0.8987677097320557,
+      "learning_rate": 0.0002,
+      "loss": 0.6521,
+      "step": 2780
+    },
+    {
+      "epoch": 3.2574430823117337,
+      "grad_norm": 0.41329991817474365,
+      "learning_rate": 0.0002,
+      "loss": 0.7871,
+      "step": 2790
+    },
+    {
+      "epoch": 3.2691185055458263,
+      "grad_norm": 0.6763975024223328,
+      "learning_rate": 0.0002,
+      "loss": 0.7045,
+      "step": 2800
+    },
+    {
+      "epoch": 3.2807939287799184,
+      "grad_norm": 0.5590912699699402,
+      "learning_rate": 0.0002,
+      "loss": 0.6573,
+      "step": 2810
+    },
+    {
+      "epoch": 3.2924693520140105,
+      "grad_norm": 0.8688486814498901,
+      "learning_rate": 0.0002,
+      "loss": 0.7809,
+      "step": 2820
+    },
+    {
+      "epoch": 3.3041447752481026,
+      "grad_norm": 0.999553918838501,
+      "learning_rate": 0.0002,
+      "loss": 0.7374,
+      "step": 2830
+    },
+    {
+      "epoch": 3.315820198482195,
+      "grad_norm": 0.5817554593086243,
+      "learning_rate": 0.0002,
+      "loss": 0.7531,
+      "step": 2840
+    },
+    {
+      "epoch": 3.327495621716287,
+      "grad_norm": 0.5741875171661377,
+      "learning_rate": 0.0002,
+      "loss": 0.6454,
+      "step": 2850
+    },
+    {
+      "epoch": 3.3391710449503793,
+      "grad_norm": 0.7241672873497009,
+      "learning_rate": 0.0002,
+      "loss": 0.7323,
+      "step": 2860
+    },
+    {
+      "epoch": 3.350846468184472,
+      "grad_norm": 0.8685355186462402,
+      "learning_rate": 0.0002,
+      "loss": 0.7355,
+      "step": 2870
+    },
+    {
+      "epoch": 3.362521891418564,
+      "grad_norm": 0.8707008361816406,
+      "learning_rate": 0.0002,
+      "loss": 0.7592,
+      "step": 2880
+    },
+    {
+      "epoch": 3.374197314652656,
+      "grad_norm": 0.397603839635849,
+      "learning_rate": 0.0002,
+      "loss": 0.7358,
+      "step": 2890
+    },
+    {
+      "epoch": 3.3858727378867486,
+      "grad_norm": 0.47490644454956055,
+      "learning_rate": 0.0002,
+      "loss": 0.7778,
+      "step": 2900
+    },
+    {
+      "epoch": 3.3975481611208407,
+      "grad_norm": 0.7608093023300171,
+      "learning_rate": 0.0002,
+      "loss": 0.7354,
+      "step": 2910
+    },
+    {
+      "epoch": 3.409223584354933,
+      "grad_norm": 0.9388294816017151,
+      "learning_rate": 0.0002,
+      "loss": 0.7098,
+      "step": 2920
+    },
+    {
+      "epoch": 3.420899007589025,
+      "grad_norm": 0.3589126169681549,
+      "learning_rate": 0.0002,
+      "loss": 0.693,
+      "step": 2930
+    },
+    {
+      "epoch": 3.4325744308231174,
+      "grad_norm": 0.5550422668457031,
+      "learning_rate": 0.0002,
+      "loss": 0.6969,
+      "step": 2940
+    },
+    {
+      "epoch": 3.4442498540572095,
+      "grad_norm": 0.7917458415031433,
+      "learning_rate": 0.0002,
+      "loss": 0.7128,
+      "step": 2950
+    },
+    {
+      "epoch": 3.4559252772913016,
+      "grad_norm": 0.3488539159297943,
+      "learning_rate": 0.0002,
+      "loss": 0.6868,
+      "step": 2960
+    },
+    {
+      "epoch": 3.467600700525394,
+      "grad_norm": 0.6213740110397339,
+      "learning_rate": 0.0002,
+      "loss": 0.6785,
+      "step": 2970
+    },
+    {
+      "epoch": 3.4792761237594863,
+      "grad_norm": 0.3689104914665222,
+      "learning_rate": 0.0002,
+      "loss": 0.6378,
+      "step": 2980
+    },
+    {
+      "epoch": 3.4909515469935783,
+      "grad_norm": 0.5803744196891785,
+      "learning_rate": 0.0002,
+      "loss": 0.6716,
+      "step": 2990
+    },
+    {
+      "epoch": 3.502626970227671,
+      "grad_norm": 0.40382176637649536,
+      "learning_rate": 0.0002,
+      "loss": 0.7826,
+      "step": 3000
+    },
+    {
+      "epoch": 3.514302393461763,
+      "grad_norm": 0.4239708185195923,
+      "learning_rate": 0.0002,
+      "loss": 0.6477,
+      "step": 3010
+    },
+    {
+      "epoch": 3.525977816695855,
+      "grad_norm": 0.7985871434211731,
+      "learning_rate": 0.0002,
+      "loss": 0.7277,
+      "step": 3020
+    },
+    {
+      "epoch": 3.5376532399299476,
+      "grad_norm": 0.34479430317878723,
+      "learning_rate": 0.0002,
+      "loss": 0.648,
+      "step": 3030
+    },
+    {
+      "epoch": 3.5493286631640397,
+      "grad_norm": 0.4610958993434906,
+      "learning_rate": 0.0002,
+      "loss": 0.7228,
+      "step": 3040
+    },
+    {
+      "epoch": 3.561004086398132,
+      "grad_norm": 0.7382773756980896,
+      "learning_rate": 0.0002,
+      "loss": 0.6823,
+      "step": 3050
+    },
+    {
+      "epoch": 3.5726795096322244,
+      "grad_norm": 1.0945557355880737,
+      "learning_rate": 0.0002,
+      "loss": 0.8486,
+      "step": 3060
+    },
+    {
+      "epoch": 3.5843549328663165,
+      "grad_norm": 0.7687262892723083,
+      "learning_rate": 0.0002,
+      "loss": 0.6702,
+      "step": 3070
+    },
+    {
+      "epoch": 3.5960303561004086,
+      "grad_norm": 0.4160486161708832,
+      "learning_rate": 0.0002,
+      "loss": 0.6835,
+      "step": 3080
+    },
+    {
+      "epoch": 3.607705779334501,
+      "grad_norm": 0.41337689757347107,
+      "learning_rate": 0.0002,
+      "loss": 0.6368,
+      "step": 3090
+    },
+    {
+      "epoch": 3.619381202568593,
+      "grad_norm": 0.6645044684410095,
+      "learning_rate": 0.0002,
+      "loss": 0.7291,
+      "step": 3100
+    },
+    {
+      "epoch": 3.6310566258026853,
+      "grad_norm": 0.6168597936630249,
+      "learning_rate": 0.0002,
+      "loss": 0.7521,
+      "step": 3110
+    },
+    {
+      "epoch": 3.642732049036778,
+      "grad_norm": 0.3842915892601013,
+      "learning_rate": 0.0002,
+      "loss": 0.7084,
+      "step": 3120
+    },
+    {
+      "epoch": 3.65440747227087,
+      "grad_norm": 0.8411704897880554,
+      "learning_rate": 0.0002,
+      "loss": 0.7826,
+      "step": 3130
+    },
+    {
+      "epoch": 3.666082895504962,
+      "grad_norm": 0.35331106185913086,
+      "learning_rate": 0.0002,
+      "loss": 0.7395,
+      "step": 3140
+    },
+    {
+      "epoch": 3.6777583187390546,
+      "grad_norm": 0.36399585008621216,
+      "learning_rate": 0.0002,
+      "loss": 0.6679,
+      "step": 3150
+    },
+    {
+      "epoch": 3.6894337419731467,
+      "grad_norm": 0.5143176913261414,
+      "learning_rate": 0.0002,
+      "loss": 0.7017,
+      "step": 3160
+    },
+    {
+      "epoch": 3.7011091652072388,
+      "grad_norm": 0.39592796564102173,
+      "learning_rate": 0.0002,
+      "loss": 0.7013,
+      "step": 3170
+    },
+    {
+      "epoch": 3.712784588441331,
+      "grad_norm": 0.5706851482391357,
+      "learning_rate": 0.0002,
+      "loss": 0.684,
+      "step": 3180
+    },
+    {
+      "epoch": 3.7244600116754234,
+      "grad_norm": 0.569911539554596,
+      "learning_rate": 0.0002,
+      "loss": 0.7194,
+      "step": 3190
+    },
+    {
+      "epoch": 3.7361354349095155,
+      "grad_norm": 0.4189624488353729,
+      "learning_rate": 0.0002,
+      "loss": 0.6366,
+      "step": 3200
+    },
+    {
+      "epoch": 3.7478108581436076,
+      "grad_norm": 0.3690492808818817,
+      "learning_rate": 0.0002,
+      "loss": 0.922,
+      "step": 3210
+    },
+    {
+      "epoch": 3.7594862813776997,
+      "grad_norm": 0.3477407693862915,
+      "learning_rate": 0.0002,
+      "loss": 0.734,
+      "step": 3220
+    },
+    {
+      "epoch": 3.7711617046117922,
+      "grad_norm": 0.36035019159317017,
+      "learning_rate": 0.0002,
+      "loss": 0.6657,
+      "step": 3230
+    },
+    {
+      "epoch": 3.7828371278458843,
+      "grad_norm": 0.4171631634235382,
+      "learning_rate": 0.0002,
+      "loss": 0.756,
+      "step": 3240
+    },
+    {
+      "epoch": 3.7945125510799764,
+      "grad_norm": 0.44240522384643555,
+      "learning_rate": 0.0002,
+      "loss": 0.6906,
+      "step": 3250
+    },
+    {
+      "epoch": 3.806187974314069,
+      "grad_norm": 0.3441826403141022,
+      "learning_rate": 0.0002,
+      "loss": 0.71,
+      "step": 3260
+    },
+    {
+      "epoch": 3.817863397548161,
+      "grad_norm": 0.605574905872345,
+      "learning_rate": 0.0002,
+      "loss": 0.6489,
+      "step": 3270
+    },
+    {
+      "epoch": 3.829538820782253,
+      "grad_norm": 0.41194725036621094,
+      "learning_rate": 0.0002,
+      "loss": 0.6945,
+      "step": 3280
+    },
+    {
+      "epoch": 3.8412142440163457,
+      "grad_norm": 0.3915793299674988,
+      "learning_rate": 0.0002,
+      "loss": 0.7075,
+      "step": 3290
+    },
+    {
+      "epoch": 3.852889667250438,
+      "grad_norm": 0.42300915718078613,
+      "learning_rate": 0.0002,
+      "loss": 0.7471,
+      "step": 3300
+    },
+    {
+      "epoch": 3.86456509048453,
+      "grad_norm": 0.9568390846252441,
+      "learning_rate": 0.0002,
+      "loss": 0.725,
+      "step": 3310
+    },
+    {
+      "epoch": 3.8762405137186224,
+      "grad_norm": 0.3875989615917206,
+      "learning_rate": 0.0002,
+      "loss": 0.6318,
+      "step": 3320
+    },
+    {
+      "epoch": 3.8879159369527145,
+      "grad_norm": 0.41842904686927795,
+      "learning_rate": 0.0002,
+      "loss": 0.6571,
+      "step": 3330
+    },
+    {
+      "epoch": 3.8995913601868066,
+      "grad_norm": 0.38325223326683044,
+      "learning_rate": 0.0002,
+      "loss": 0.6879,
+      "step": 3340
+    },
+    {
+      "epoch": 3.911266783420899,
+      "grad_norm": 0.7474912405014038,
+      "learning_rate": 0.0002,
+      "loss": 0.7766,
+      "step": 3350
+    },
+    {
+      "epoch": 3.9229422066549913,
+      "grad_norm": 0.47073325514793396,
+      "learning_rate": 0.0002,
+      "loss": 0.7253,
+      "step": 3360
+    },
+    {
+      "epoch": 3.9346176298890834,
+      "grad_norm": 0.5380847454071045,
+      "learning_rate": 0.0002,
+      "loss": 0.7141,
+      "step": 3370
+    },
+    {
+      "epoch": 3.946293053123176,
+      "grad_norm": 0.4632789194583893,
+      "learning_rate": 0.0002,
+      "loss": 0.689,
+      "step": 3380
+    },
+    {
+      "epoch": 3.957968476357268,
+      "grad_norm": 0.548065185546875,
+      "learning_rate": 0.0002,
+      "loss": 0.6715,
+      "step": 3390
+    },
+    {
+      "epoch": 3.96964389959136,
+      "grad_norm": 0.45601144433021545,
+      "learning_rate": 0.0002,
+      "loss": 0.711,
+      "step": 3400
+    },
+    {
+      "epoch": 3.9813193228254526,
+      "grad_norm": 0.3866109848022461,
+      "learning_rate": 0.0002,
+      "loss": 0.6989,
+      "step": 3410
+    },
+    {
+      "epoch": 3.9929947460595447,
+      "grad_norm": 0.6861770749092102,
+      "learning_rate": 0.0002,
+      "loss": 0.6998,
+      "step": 3420
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.2217445373535156,
+      "eval_runtime": 39.1717,
+      "eval_samples_per_second": 11.692,
+      "eval_steps_per_second": 1.481,
+      "step": 3426
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6848,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4925677734762906e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb077d5937aba7d77271acaac7bb330e71c022289d873a61f3df9490020d208d
+size 5688

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86faf97492c3f319442ec6871c738db806b633d8344a7f981b6a85f46ef3ff76
+size 80755416