MilaWang committed on Mar 28, 2025

Commit

c8ff779

verified ·

1 Parent(s): c218d7d

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/merges.txt +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/optimizer.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/rng_state.pth +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/scheduler.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/special_tokens_map.json +14 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/tokenizer.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/tokenizer_config.json +43 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/trainer_state.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/training_args.bin +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-10296/vocab.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/merges.txt +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/optimizer.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/rng_state.pth +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/scheduler.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/special_tokens_map.json +14 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/tokenizer.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/tokenizer_config.json +43 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/trainer_state.json +937 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/training_args.bin +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287/vocab.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/added_tokens.json +5 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/merges.txt +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/optimizer.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/rng_state.pth +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/scheduler.pt +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/special_tokens_map.json +14 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/tokenizer.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/tokenizer_config.json +43 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/trainer_state.json +1848 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/training_args.bin +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574/vocab.json +0 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-3861/README.md +202 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-3861/adapter_config.json +29 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-3861/adapter_model.safetensors +3 -0
Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-3861/added_tokens.json +5 -0

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5b46ee00f8c14ad16ad37579ebaffb4bd9154b8368b81822719b9da300ef108
+size 80755416

Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c04d1eff5ea81e64a82f8a1073e0460f7ec58a5c87d454ba21c8dc15309817a6
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a94869da3feab0cc6c884545a258fbf3a05927df67a1afb4f85fdcf82427c1d
+size 41136570

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:889be2c30eced8c483cafd8691c5fafae350b82890aad7a2bdfbac747695a5c6
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4fbac8224393fecc9e860daccad502ed1ad004255977260e4a1c58acfefbee1
+size 1064

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b36338332cddabc7310f2540f453242087f16525e09249c71d1000e9b851228
+size 5560

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e210983c09cde1638d0cde4a621b520fb06a4c2da2a28b7999ceb685d45d45e
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aea9881d86ae737a50d9aa1097a20ffba594d9c81945b961f1875ee763d95519
+size 41136570

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6bab7f01a35c8f0b8ab099fdcfc9da8e8f5b1eca53fba024d374e30adffbc960
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fd0fec07cf84fbcf48e2e02a2db554dd5c13afad429460b9c4cbc81650c3635
+size 1064

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

	@@ -0,0 +1,937 @@

+{
+  "best_metric": 1.129408359527588,
+  "best_model_checkpoint": "outputs-001/Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-1287",
+  "epoch": 1.0,
+  "eval_steps": 10,
+  "global_step": 1287,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00777000777000777,
+      "grad_norm": 0.30297431349754333,
+      "learning_rate": 0.0002,
+      "loss": 1.7434,
+      "step": 10
+    },
+    {
+      "epoch": 0.01554001554001554,
+      "grad_norm": 0.2484888881444931,
+      "learning_rate": 0.0002,
+      "loss": 1.4534,
+      "step": 20
+    },
+    {
+      "epoch": 0.023310023310023312,
+      "grad_norm": 0.23519672453403473,
+      "learning_rate": 0.0002,
+      "loss": 1.3485,
+      "step": 30
+    },
+    {
+      "epoch": 0.03108003108003108,
+      "grad_norm": 0.32419469952583313,
+      "learning_rate": 0.0002,
+      "loss": 1.2596,
+      "step": 40
+    },
+    {
+      "epoch": 0.03885003885003885,
+      "grad_norm": 0.29397228360176086,
+      "learning_rate": 0.0002,
+      "loss": 1.1727,
+      "step": 50
+    },
+    {
+      "epoch": 0.046620046620046623,
+      "grad_norm": 0.21554380655288696,
+      "learning_rate": 0.0002,
+      "loss": 1.244,
+      "step": 60
+    },
+    {
+      "epoch": 0.05439005439005439,
+      "grad_norm": 0.24259765446186066,
+      "learning_rate": 0.0002,
+      "loss": 1.1753,
+      "step": 70
+    },
+    {
+      "epoch": 0.06216006216006216,
+      "grad_norm": 0.24977324903011322,
+      "learning_rate": 0.0002,
+      "loss": 1.1667,
+      "step": 80
+    },
+    {
+      "epoch": 0.06993006993006994,
+      "grad_norm": 0.24173440039157867,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 90
+    },
+    {
+      "epoch": 0.0777000777000777,
+      "grad_norm": 0.22682763636112213,
+      "learning_rate": 0.0002,
+      "loss": 1.2292,
+      "step": 100
+    },
+    {
+      "epoch": 0.08547008547008547,
+      "grad_norm": 0.2315922975540161,
+      "learning_rate": 0.0002,
+      "loss": 1.0986,
+      "step": 110
+    },
+    {
+      "epoch": 0.09324009324009325,
+      "grad_norm": 0.2465641051530838,
+      "learning_rate": 0.0002,
+      "loss": 1.1294,
+      "step": 120
+    },
+    {
+      "epoch": 0.10101010101010101,
+      "grad_norm": 0.2049807608127594,
+      "learning_rate": 0.0002,
+      "loss": 1.123,
+      "step": 130
+    },
+    {
+      "epoch": 0.10878010878010878,
+      "grad_norm": 0.2138686627149582,
+      "learning_rate": 0.0002,
+      "loss": 1.1161,
+      "step": 140
+    },
+    {
+      "epoch": 0.11655011655011654,
+      "grad_norm": 0.24715466797351837,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 150
+    },
+    {
+      "epoch": 0.12432012432012432,
+      "grad_norm": 0.2159343808889389,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 160
+    },
+    {
+      "epoch": 0.1320901320901321,
+      "grad_norm": 0.22828488051891327,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 170
+    },
+    {
+      "epoch": 0.13986013986013987,
+      "grad_norm": 0.17286927998065948,
+      "learning_rate": 0.0002,
+      "loss": 1.1639,
+      "step": 180
+    },
+    {
+      "epoch": 0.14763014763014762,
+      "grad_norm": 0.22565744817256927,
+      "learning_rate": 0.0002,
+      "loss": 1.1129,
+      "step": 190
+    },
+    {
+      "epoch": 0.1554001554001554,
+      "grad_norm": 0.26638466119766235,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 200
+    },
+    {
+      "epoch": 0.16317016317016317,
+      "grad_norm": 0.2293207049369812,
+      "learning_rate": 0.0002,
+      "loss": 1.1187,
+      "step": 210
+    },
+    {
+      "epoch": 0.17094017094017094,
+      "grad_norm": 0.2655271291732788,
+      "learning_rate": 0.0002,
+      "loss": 1.185,
+      "step": 220
+    },
+    {
+      "epoch": 0.17871017871017872,
+      "grad_norm": 0.22603964805603027,
+      "learning_rate": 0.0002,
+      "loss": 1.2833,
+      "step": 230
+    },
+    {
+      "epoch": 0.1864801864801865,
+      "grad_norm": 0.23628494143486023,
+      "learning_rate": 0.0002,
+      "loss": 1.147,
+      "step": 240
+    },
+    {
+      "epoch": 0.19425019425019424,
+      "grad_norm": 0.21186968684196472,
+      "learning_rate": 0.0002,
+      "loss": 1.0686,
+      "step": 250
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 0.19292838871479034,
+      "learning_rate": 0.0002,
+      "loss": 1.2727,
+      "step": 260
+    },
+    {
+      "epoch": 0.2097902097902098,
+      "grad_norm": 0.28150689601898193,
+      "learning_rate": 0.0002,
+      "loss": 1.087,
+      "step": 270
+    },
+    {
+      "epoch": 0.21756021756021757,
+      "grad_norm": 0.23384647071361542,
+      "learning_rate": 0.0002,
+      "loss": 1.0772,
+      "step": 280
+    },
+    {
+      "epoch": 0.22533022533022534,
+      "grad_norm": 0.20646218955516815,
+      "learning_rate": 0.0002,
+      "loss": 1.1422,
+      "step": 290
+    },
+    {
+      "epoch": 0.2331002331002331,
+      "grad_norm": 0.2451605200767517,
+      "learning_rate": 0.0002,
+      "loss": 1.1577,
+      "step": 300
+    },
+    {
+      "epoch": 0.24087024087024086,
+      "grad_norm": 0.19848696887493134,
+      "learning_rate": 0.0002,
+      "loss": 1.1144,
+      "step": 310
+    },
+    {
+      "epoch": 0.24864024864024864,
+      "grad_norm": 0.24066492915153503,
+      "learning_rate": 0.0002,
+      "loss": 0.9849,
+      "step": 320
+    },
+    {
+      "epoch": 0.2564102564102564,
+      "grad_norm": 0.2493826001882553,
+      "learning_rate": 0.0002,
+      "loss": 1.0673,
+      "step": 330
+    },
+    {
+      "epoch": 0.2641802641802642,
+      "grad_norm": 0.22245089709758759,
+      "learning_rate": 0.0002,
+      "loss": 1.1466,
+      "step": 340
+    },
+    {
+      "epoch": 0.27195027195027194,
+      "grad_norm": 0.24811948835849762,
+      "learning_rate": 0.0002,
+      "loss": 1.1971,
+      "step": 350
+    },
+    {
+      "epoch": 0.27972027972027974,
+      "grad_norm": 0.1916377991437912,
+      "learning_rate": 0.0002,
+      "loss": 1.2911,
+      "step": 360
+    },
+    {
+      "epoch": 0.2874902874902875,
+      "grad_norm": 0.2213028222322464,
+      "learning_rate": 0.0002,
+      "loss": 1.0964,
+      "step": 370
+    },
+    {
+      "epoch": 0.29526029526029524,
+      "grad_norm": 0.2754485011100769,
+      "learning_rate": 0.0002,
+      "loss": 1.0724,
+      "step": 380
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 0.2230151891708374,
+      "learning_rate": 0.0002,
+      "loss": 1.2163,
+      "step": 390
+    },
+    {
+      "epoch": 0.3108003108003108,
+      "grad_norm": 0.2000817507505417,
+      "learning_rate": 0.0002,
+      "loss": 1.1313,
+      "step": 400
+    },
+    {
+      "epoch": 0.3185703185703186,
+      "grad_norm": 0.18378672003746033,
+      "learning_rate": 0.0002,
+      "loss": 1.1837,
+      "step": 410
+    },
+    {
+      "epoch": 0.32634032634032634,
+      "grad_norm": 0.2163156419992447,
+      "learning_rate": 0.0002,
+      "loss": 1.052,
+      "step": 420
+    },
+    {
+      "epoch": 0.3341103341103341,
+      "grad_norm": 0.22171632945537567,
+      "learning_rate": 0.0002,
+      "loss": 0.9625,
+      "step": 430
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.19466277956962585,
+      "learning_rate": 0.0002,
+      "loss": 1.1991,
+      "step": 440
+    },
+    {
+      "epoch": 0.34965034965034963,
+      "grad_norm": 0.24126316606998444,
+      "learning_rate": 0.0002,
+      "loss": 1.1263,
+      "step": 450
+    },
+    {
+      "epoch": 0.35742035742035744,
+      "grad_norm": 0.20549152791500092,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 460
+    },
+    {
+      "epoch": 0.3651903651903652,
+      "grad_norm": 0.31305992603302,
+      "learning_rate": 0.0002,
+      "loss": 1.241,
+      "step": 470
+    },
+    {
+      "epoch": 0.372960372960373,
+      "grad_norm": 0.23699617385864258,
+      "learning_rate": 0.0002,
+      "loss": 1.1476,
+      "step": 480
+    },
+    {
+      "epoch": 0.38073038073038074,
+      "grad_norm": 0.18351434171199799,
+      "learning_rate": 0.0002,
+      "loss": 1.157,
+      "step": 490
+    },
+    {
+      "epoch": 0.3885003885003885,
+      "grad_norm": 0.19032520055770874,
+      "learning_rate": 0.0002,
+      "loss": 1.1624,
+      "step": 500
+    },
+    {
+      "epoch": 0.3962703962703963,
+      "grad_norm": 0.1658385992050171,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 510
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 0.18995201587677002,
+      "learning_rate": 0.0002,
+      "loss": 0.9975,
+      "step": 520
+    },
+    {
+      "epoch": 0.41181041181041184,
+      "grad_norm": 0.19139058887958527,
+      "learning_rate": 0.0002,
+      "loss": 1.1068,
+      "step": 530
+    },
+    {
+      "epoch": 0.4195804195804196,
+      "grad_norm": 0.24339012801647186,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 540
+    },
+    {
+      "epoch": 0.42735042735042733,
+      "grad_norm": 0.31135988235473633,
+      "learning_rate": 0.0002,
+      "loss": 1.1619,
+      "step": 550
+    },
+    {
+      "epoch": 0.43512043512043513,
+      "grad_norm": 0.2381313443183899,
+      "learning_rate": 0.0002,
+      "loss": 1.1504,
+      "step": 560
+    },
+    {
+      "epoch": 0.4428904428904429,
+      "grad_norm": 0.22092527151107788,
+      "learning_rate": 0.0002,
+      "loss": 1.1847,
+      "step": 570
+    },
+    {
+      "epoch": 0.4506604506604507,
+      "grad_norm": 0.17754223942756653,
+      "learning_rate": 0.0002,
+      "loss": 1.0738,
+      "step": 580
+    },
+    {
+      "epoch": 0.45843045843045843,
+      "grad_norm": 0.20770515501499176,
+      "learning_rate": 0.0002,
+      "loss": 1.1371,
+      "step": 590
+    },
+    {
+      "epoch": 0.4662004662004662,
+      "grad_norm": 0.1945110708475113,
+      "learning_rate": 0.0002,
+      "loss": 1.1105,
+      "step": 600
+    },
+    {
+      "epoch": 0.473970473970474,
+      "grad_norm": 0.1895918995141983,
+      "learning_rate": 0.0002,
+      "loss": 1.0277,
+      "step": 610
+    },
+    {
+      "epoch": 0.48174048174048173,
+      "grad_norm": 0.2508707642555237,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 620
+    },
+    {
+      "epoch": 0.48951048951048953,
+      "grad_norm": 0.22974248230457306,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 630
+    },
+    {
+      "epoch": 0.4972804972804973,
+      "grad_norm": 0.23621833324432373,
+      "learning_rate": 0.0002,
+      "loss": 1.1523,
+      "step": 640
+    },
+    {
+      "epoch": 0.5050505050505051,
+      "grad_norm": 0.19252857565879822,
+      "learning_rate": 0.0002,
+      "loss": 1.1975,
+      "step": 650
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 0.19978870451450348,
+      "learning_rate": 0.0002,
+      "loss": 1.0326,
+      "step": 660
+    },
+    {
+      "epoch": 0.5205905205905206,
+      "grad_norm": 0.6729635000228882,
+      "learning_rate": 0.0002,
+      "loss": 1.1651,
+      "step": 670
+    },
+    {
+      "epoch": 0.5283605283605284,
+      "grad_norm": 0.3017582893371582,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 680
+    },
+    {
+      "epoch": 0.5361305361305362,
+      "grad_norm": 0.20520491898059845,
+      "learning_rate": 0.0002,
+      "loss": 1.1229,
+      "step": 690
+    },
+    {
+      "epoch": 0.5439005439005439,
+      "grad_norm": 0.21933147311210632,
+      "learning_rate": 0.0002,
+      "loss": 1.1065,
+      "step": 700
+    },
+    {
+      "epoch": 0.5516705516705517,
+      "grad_norm": 0.21090354025363922,
+      "learning_rate": 0.0002,
+      "loss": 1.0411,
+      "step": 710
+    },
+    {
+      "epoch": 0.5594405594405595,
+      "grad_norm": 0.2536642551422119,
+      "learning_rate": 0.0002,
+      "loss": 1.2321,
+      "step": 720
+    },
+    {
+      "epoch": 0.5672105672105672,
+      "grad_norm": 0.22613082826137543,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 730
+    },
+    {
+      "epoch": 0.574980574980575,
+      "grad_norm": 0.20228610932826996,
+      "learning_rate": 0.0002,
+      "loss": 1.2138,
+      "step": 740
+    },
+    {
+      "epoch": 0.5827505827505828,
+      "grad_norm": 0.1765434592962265,
+      "learning_rate": 0.0002,
+      "loss": 1.2434,
+      "step": 750
+    },
+    {
+      "epoch": 0.5905205905205905,
+      "grad_norm": 0.20663489401340485,
+      "learning_rate": 0.0002,
+      "loss": 1.0774,
+      "step": 760
+    },
+    {
+      "epoch": 0.5982905982905983,
+      "grad_norm": 0.19824334979057312,
+      "learning_rate": 0.0002,
+      "loss": 1.1889,
+      "step": 770
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.20710207521915436,
+      "learning_rate": 0.0002,
+      "loss": 1.0703,
+      "step": 780
+    },
+    {
+      "epoch": 0.6138306138306139,
+      "grad_norm": 0.15639153122901917,
+      "learning_rate": 0.0002,
+      "loss": 1.1448,
+      "step": 790
+    },
+    {
+      "epoch": 0.6216006216006216,
+      "grad_norm": 0.29030027985572815,
+      "learning_rate": 0.0002,
+      "loss": 1.093,
+      "step": 800
+    },
+    {
+      "epoch": 0.6293706293706294,
+      "grad_norm": 0.21650588512420654,
+      "learning_rate": 0.0002,
+      "loss": 1.0669,
+      "step": 810
+    },
+    {
+      "epoch": 0.6371406371406372,
+      "grad_norm": 0.5422983169555664,
+      "learning_rate": 0.0002,
+      "loss": 1.1659,
+      "step": 820
+    },
+    {
+      "epoch": 0.6449106449106449,
+      "grad_norm": 0.2035626322031021,
+      "learning_rate": 0.0002,
+      "loss": 1.0996,
+      "step": 830
+    },
+    {
+      "epoch": 0.6526806526806527,
+      "grad_norm": 0.23480531573295593,
+      "learning_rate": 0.0002,
+      "loss": 1.2308,
+      "step": 840
+    },
+    {
+      "epoch": 0.6604506604506605,
+      "grad_norm": 0.19458115100860596,
+      "learning_rate": 0.0002,
+      "loss": 1.2648,
+      "step": 850
+    },
+    {
+      "epoch": 0.6682206682206682,
+      "grad_norm": 0.2670581638813019,
+      "learning_rate": 0.0002,
+      "loss": 1.0638,
+      "step": 860
+    },
+    {
+      "epoch": 0.675990675990676,
+      "grad_norm": 0.1925385743379593,
+      "learning_rate": 0.0002,
+      "loss": 1.1718,
+      "step": 870
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.23513855040073395,
+      "learning_rate": 0.0002,
+      "loss": 1.068,
+      "step": 880
+    },
+    {
+      "epoch": 0.6915306915306916,
+      "grad_norm": 0.2000550478696823,
+      "learning_rate": 0.0002,
+      "loss": 1.2692,
+      "step": 890
+    },
+    {
+      "epoch": 0.6993006993006993,
+      "grad_norm": 0.26091018319129944,
+      "learning_rate": 0.0002,
+      "loss": 1.1071,
+      "step": 900
+    },
+    {
+      "epoch": 0.7070707070707071,
+      "grad_norm": 0.18439704179763794,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 910
+    },
+    {
+      "epoch": 0.7148407148407149,
+      "grad_norm": 0.2290486991405487,
+      "learning_rate": 0.0002,
+      "loss": 1.1189,
+      "step": 920
+    },
+    {
+      "epoch": 0.7226107226107226,
+      "grad_norm": 0.2338181883096695,
+      "learning_rate": 0.0002,
+      "loss": 1.1858,
+      "step": 930
+    },
+    {
+      "epoch": 0.7303807303807304,
+      "grad_norm": 0.19802013039588928,
+      "learning_rate": 0.0002,
+      "loss": 0.9945,
+      "step": 940
+    },
+    {
+      "epoch": 0.7381507381507382,
+      "grad_norm": 0.19329555332660675,
+      "learning_rate": 0.0002,
+      "loss": 0.9957,
+      "step": 950
+    },
+    {
+      "epoch": 0.745920745920746,
+      "grad_norm": 0.22874726355075836,
+      "learning_rate": 0.0002,
+      "loss": 1.0822,
+      "step": 960
+    },
+    {
+      "epoch": 0.7536907536907537,
+      "grad_norm": 0.24897505342960358,
+      "learning_rate": 0.0002,
+      "loss": 1.1034,
+      "step": 970
+    },
+    {
+      "epoch": 0.7614607614607615,
+      "grad_norm": 0.19370736181735992,
+      "learning_rate": 0.0002,
+      "loss": 1.1134,
+      "step": 980
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.18071319162845612,
+      "learning_rate": 0.0002,
+      "loss": 1.0785,
+      "step": 990
+    },
+    {
+      "epoch": 0.777000777000777,
+      "grad_norm": 0.941108226776123,
+      "learning_rate": 0.0002,
+      "loss": 1.1866,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7847707847707848,
+      "grad_norm": 0.21083903312683105,
+      "learning_rate": 0.0002,
+      "loss": 1.1012,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7925407925407926,
+      "grad_norm": 0.2541967034339905,
+      "learning_rate": 0.0002,
+      "loss": 1.1665,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8003108003108003,
+      "grad_norm": 0.2519970238208771,
+      "learning_rate": 0.0002,
+      "loss": 1.0513,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.23185153305530548,
+      "learning_rate": 0.0002,
+      "loss": 1.0795,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8158508158508159,
+      "grad_norm": 0.16371922194957733,
+      "learning_rate": 0.0002,
+      "loss": 1.0451,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8236208236208237,
+      "grad_norm": 0.23721691966056824,
+      "learning_rate": 0.0002,
+      "loss": 1.2314,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8313908313908314,
+      "grad_norm": 0.28295132517814636,
+      "learning_rate": 0.0002,
+      "loss": 1.1663,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8391608391608392,
+      "grad_norm": 0.2982921004295349,
+      "learning_rate": 0.0002,
+      "loss": 1.0625,
+      "step": 1080
+    },
+    {
+      "epoch": 0.846930846930847,
+      "grad_norm": 0.2634515166282654,
+      "learning_rate": 0.0002,
+      "loss": 1.1936,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8547008547008547,
+      "grad_norm": 0.2118266075849533,
+      "learning_rate": 0.0002,
+      "loss": 1.2291,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8624708624708625,
+      "grad_norm": 0.2321753352880478,
+      "learning_rate": 0.0002,
+      "loss": 1.1269,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8702408702408703,
+      "grad_norm": 0.1881166696548462,
+      "learning_rate": 0.0002,
+      "loss": 1.122,
+      "step": 1120
+    },
+    {
+      "epoch": 0.878010878010878,
+      "grad_norm": 0.1806720793247223,
+      "learning_rate": 0.0002,
+      "loss": 1.2244,
+      "step": 1130
+    },
+    {
+      "epoch": 0.8857808857808858,
+      "grad_norm": 0.2697276473045349,
+      "learning_rate": 0.0002,
+      "loss": 1.1129,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8935508935508936,
+      "grad_norm": 0.1976407915353775,
+      "learning_rate": 0.0002,
+      "loss": 1.0894,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9013209013209014,
+      "grad_norm": 0.2299172729253769,
+      "learning_rate": 0.0002,
+      "loss": 1.1296,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.2056407332420349,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9168609168609169,
+      "grad_norm": 0.2531585395336151,
+      "learning_rate": 0.0002,
+      "loss": 0.9501,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9246309246309247,
+      "grad_norm": 0.21540220081806183,
+      "learning_rate": 0.0002,
+      "loss": 0.9781,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9324009324009324,
+      "grad_norm": 0.23623155057430267,
+      "learning_rate": 0.0002,
+      "loss": 1.2535,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9401709401709402,
+      "grad_norm": 0.15985913574695587,
+      "learning_rate": 0.0002,
+      "loss": 1.2165,
+      "step": 1210
+    },
+    {
+      "epoch": 0.947940947940948,
+      "grad_norm": 0.21025371551513672,
+      "learning_rate": 0.0002,
+      "loss": 1.1809,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9557109557109557,
+      "grad_norm": 0.21953997015953064,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9634809634809635,
+      "grad_norm": 0.2306654453277588,
+      "learning_rate": 0.0002,
+      "loss": 1.0345,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9712509712509713,
+      "grad_norm": 0.18655075132846832,
+      "learning_rate": 0.0002,
+      "loss": 1.168,
+      "step": 1250
+    },
+    {
+      "epoch": 0.9790209790209791,
+      "grad_norm": 0.1910385638475418,
+      "learning_rate": 0.0002,
+      "loss": 1.1434,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9867909867909868,
+      "grad_norm": 0.1794031411409378,
+      "learning_rate": 0.0002,
+      "loss": 1.1564,
+      "step": 1270
+    },
+    {
+      "epoch": 0.9945609945609946,
+      "grad_norm": 0.21666628122329712,
+      "learning_rate": 0.0002,
+      "loss": 1.147,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.129408359527588,
+      "eval_runtime": 126.1267,
+      "eval_samples_per_second": 3.631,
+      "eval_steps_per_second": 0.46,
+      "step": 1287
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10296,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.606931478295347e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b36338332cddabc7310f2540f453242087f16525e09249c71d1000e9b851228
+size 5560

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT, `CAUSAL_LM` task type)
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen2-7B-Instruct
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly. -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5b46ee00f8c14ad16ad37579ebaffb4bd9154b8368b81822719b9da300ef108
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:141adbdbf7206762134e9e74bf6d67e4624b7e51428613c37c6a74e81aff0373
+size 41136570

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf6e1ffffb8fb08a2ec2044747b49adb74477cff0d07942fb0f093fd431cc90e
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18713c13e64b493fc29d5242e56c9cdd955afa4e9ead4166d2be2ea6219dc092
+size 1064

	@@ -0,0 +1,14 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

	@@ -0,0 +1,1848 @@

+{
+  "best_metric": 1.110044240951538,
+  "best_model_checkpoint": "outputs-001/Qwen2-7B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-7847-sd-42/checkpoint-2574",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 2574,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00777000777000777,
+      "grad_norm": 0.30297431349754333,
+      "learning_rate": 0.0002,
+      "loss": 1.7434,
+      "step": 10
+    },
+    {
+      "epoch": 0.01554001554001554,
+      "grad_norm": 0.2484888881444931,
+      "learning_rate": 0.0002,
+      "loss": 1.4534,
+      "step": 20
+    },
+    {
+      "epoch": 0.023310023310023312,
+      "grad_norm": 0.23519672453403473,
+      "learning_rate": 0.0002,
+      "loss": 1.3485,
+      "step": 30
+    },
+    {
+      "epoch": 0.03108003108003108,
+      "grad_norm": 0.32419469952583313,
+      "learning_rate": 0.0002,
+      "loss": 1.2596,
+      "step": 40
+    },
+    {
+      "epoch": 0.03885003885003885,
+      "grad_norm": 0.29397228360176086,
+      "learning_rate": 0.0002,
+      "loss": 1.1727,
+      "step": 50
+    },
+    {
+      "epoch": 0.046620046620046623,
+      "grad_norm": 0.21554380655288696,
+      "learning_rate": 0.0002,
+      "loss": 1.244,
+      "step": 60
+    },
+    {
+      "epoch": 0.05439005439005439,
+      "grad_norm": 0.24259765446186066,
+      "learning_rate": 0.0002,
+      "loss": 1.1753,
+      "step": 70
+    },
+    {
+      "epoch": 0.06216006216006216,
+      "grad_norm": 0.24977324903011322,
+      "learning_rate": 0.0002,
+      "loss": 1.1667,
+      "step": 80
+    },
+    {
+      "epoch": 0.06993006993006994,
+      "grad_norm": 0.24173440039157867,
+      "learning_rate": 0.0002,
+      "loss": 1.1636,
+      "step": 90
+    },
+    {
+      "epoch": 0.0777000777000777,
+      "grad_norm": 0.22682763636112213,
+      "learning_rate": 0.0002,
+      "loss": 1.2292,
+      "step": 100
+    },
+    {
+      "epoch": 0.08547008547008547,
+      "grad_norm": 0.2315922975540161,
+      "learning_rate": 0.0002,
+      "loss": 1.0986,
+      "step": 110
+    },
+    {
+      "epoch": 0.09324009324009325,
+      "grad_norm": 0.2465641051530838,
+      "learning_rate": 0.0002,
+      "loss": 1.1294,
+      "step": 120
+    },
+    {
+      "epoch": 0.10101010101010101,
+      "grad_norm": 0.2049807608127594,
+      "learning_rate": 0.0002,
+      "loss": 1.123,
+      "step": 130
+    },
+    {
+      "epoch": 0.10878010878010878,
+      "grad_norm": 0.2138686627149582,
+      "learning_rate": 0.0002,
+      "loss": 1.1161,
+      "step": 140
+    },
+    {
+      "epoch": 0.11655011655011654,
+      "grad_norm": 0.24715466797351837,
+      "learning_rate": 0.0002,
+      "loss": 1.1465,
+      "step": 150
+    },
+    {
+      "epoch": 0.12432012432012432,
+      "grad_norm": 0.2159343808889389,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 160
+    },
+    {
+      "epoch": 0.1320901320901321,
+      "grad_norm": 0.22828488051891327,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 170
+    },
+    {
+      "epoch": 0.13986013986013987,
+      "grad_norm": 0.17286927998065948,
+      "learning_rate": 0.0002,
+      "loss": 1.1639,
+      "step": 180
+    },
+    {
+      "epoch": 0.14763014763014762,
+      "grad_norm": 0.22565744817256927,
+      "learning_rate": 0.0002,
+      "loss": 1.1129,
+      "step": 190
+    },
+    {
+      "epoch": 0.1554001554001554,
+      "grad_norm": 0.26638466119766235,
+      "learning_rate": 0.0002,
+      "loss": 1.1128,
+      "step": 200
+    },
+    {
+      "epoch": 0.16317016317016317,
+      "grad_norm": 0.2293207049369812,
+      "learning_rate": 0.0002,
+      "loss": 1.1187,
+      "step": 210
+    },
+    {
+      "epoch": 0.17094017094017094,
+      "grad_norm": 0.2655271291732788,
+      "learning_rate": 0.0002,
+      "loss": 1.185,
+      "step": 220
+    },
+    {
+      "epoch": 0.17871017871017872,
+      "grad_norm": 0.22603964805603027,
+      "learning_rate": 0.0002,
+      "loss": 1.2833,
+      "step": 230
+    },
+    {
+      "epoch": 0.1864801864801865,
+      "grad_norm": 0.23628494143486023,
+      "learning_rate": 0.0002,
+      "loss": 1.147,
+      "step": 240
+    },
+    {
+      "epoch": 0.19425019425019424,
+      "grad_norm": 0.21186968684196472,
+      "learning_rate": 0.0002,
+      "loss": 1.0686,
+      "step": 250
+    },
+    {
+      "epoch": 0.20202020202020202,
+      "grad_norm": 0.19292838871479034,
+      "learning_rate": 0.0002,
+      "loss": 1.2727,
+      "step": 260
+    },
+    {
+      "epoch": 0.2097902097902098,
+      "grad_norm": 0.28150689601898193,
+      "learning_rate": 0.0002,
+      "loss": 1.087,
+      "step": 270
+    },
+    {
+      "epoch": 0.21756021756021757,
+      "grad_norm": 0.23384647071361542,
+      "learning_rate": 0.0002,
+      "loss": 1.0772,
+      "step": 280
+    },
+    {
+      "epoch": 0.22533022533022534,
+      "grad_norm": 0.20646218955516815,
+      "learning_rate": 0.0002,
+      "loss": 1.1422,
+      "step": 290
+    },
+    {
+      "epoch": 0.2331002331002331,
+      "grad_norm": 0.2451605200767517,
+      "learning_rate": 0.0002,
+      "loss": 1.1577,
+      "step": 300
+    },
+    {
+      "epoch": 0.24087024087024086,
+      "grad_norm": 0.19848696887493134,
+      "learning_rate": 0.0002,
+      "loss": 1.1144,
+      "step": 310
+    },
+    {
+      "epoch": 0.24864024864024864,
+      "grad_norm": 0.24066492915153503,
+      "learning_rate": 0.0002,
+      "loss": 0.9849,
+      "step": 320
+    },
+    {
+      "epoch": 0.2564102564102564,
+      "grad_norm": 0.2493826001882553,
+      "learning_rate": 0.0002,
+      "loss": 1.0673,
+      "step": 330
+    },
+    {
+      "epoch": 0.2641802641802642,
+      "grad_norm": 0.22245089709758759,
+      "learning_rate": 0.0002,
+      "loss": 1.1466,
+      "step": 340
+    },
+    {
+      "epoch": 0.27195027195027194,
+      "grad_norm": 0.24811948835849762,
+      "learning_rate": 0.0002,
+      "loss": 1.1971,
+      "step": 350
+    },
+    {
+      "epoch": 0.27972027972027974,
+      "grad_norm": 0.1916377991437912,
+      "learning_rate": 0.0002,
+      "loss": 1.2911,
+      "step": 360
+    },
+    {
+      "epoch": 0.2874902874902875,
+      "grad_norm": 0.2213028222322464,
+      "learning_rate": 0.0002,
+      "loss": 1.0964,
+      "step": 370
+    },
+    {
+      "epoch": 0.29526029526029524,
+      "grad_norm": 0.2754485011100769,
+      "learning_rate": 0.0002,
+      "loss": 1.0724,
+      "step": 380
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 0.2230151891708374,
+      "learning_rate": 0.0002,
+      "loss": 1.2163,
+      "step": 390
+    },
+    {
+      "epoch": 0.3108003108003108,
+      "grad_norm": 0.2000817507505417,
+      "learning_rate": 0.0002,
+      "loss": 1.1313,
+      "step": 400
+    },
+    {
+      "epoch": 0.3185703185703186,
+      "grad_norm": 0.18378672003746033,
+      "learning_rate": 0.0002,
+      "loss": 1.1837,
+      "step": 410
+    },
+    {
+      "epoch": 0.32634032634032634,
+      "grad_norm": 0.2163156419992447,
+      "learning_rate": 0.0002,
+      "loss": 1.052,
+      "step": 420
+    },
+    {
+      "epoch": 0.3341103341103341,
+      "grad_norm": 0.22171632945537567,
+      "learning_rate": 0.0002,
+      "loss": 0.9625,
+      "step": 430
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.19466277956962585,
+      "learning_rate": 0.0002,
+      "loss": 1.1991,
+      "step": 440
+    },
+    {
+      "epoch": 0.34965034965034963,
+      "grad_norm": 0.24126316606998444,
+      "learning_rate": 0.0002,
+      "loss": 1.1263,
+      "step": 450
+    },
+    {
+      "epoch": 0.35742035742035744,
+      "grad_norm": 0.20549152791500092,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 460
+    },
+    {
+      "epoch": 0.3651903651903652,
+      "grad_norm": 0.31305992603302,
+      "learning_rate": 0.0002,
+      "loss": 1.241,
+      "step": 470
+    },
+    {
+      "epoch": 0.372960372960373,
+      "grad_norm": 0.23699617385864258,
+      "learning_rate": 0.0002,
+      "loss": 1.1476,
+      "step": 480
+    },
+    {
+      "epoch": 0.38073038073038074,
+      "grad_norm": 0.18351434171199799,
+      "learning_rate": 0.0002,
+      "loss": 1.157,
+      "step": 490
+    },
+    {
+      "epoch": 0.3885003885003885,
+      "grad_norm": 0.19032520055770874,
+      "learning_rate": 0.0002,
+      "loss": 1.1624,
+      "step": 500
+    },
+    {
+      "epoch": 0.3962703962703963,
+      "grad_norm": 0.1658385992050171,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 510
+    },
+    {
+      "epoch": 0.40404040404040403,
+      "grad_norm": 0.18995201587677002,
+      "learning_rate": 0.0002,
+      "loss": 0.9975,
+      "step": 520
+    },
+    {
+      "epoch": 0.41181041181041184,
+      "grad_norm": 0.19139058887958527,
+      "learning_rate": 0.0002,
+      "loss": 1.1068,
+      "step": 530
+    },
+    {
+      "epoch": 0.4195804195804196,
+      "grad_norm": 0.24339012801647186,
+      "learning_rate": 0.0002,
+      "loss": 1.0966,
+      "step": 540
+    },
+    {
+      "epoch": 0.42735042735042733,
+      "grad_norm": 0.31135988235473633,
+      "learning_rate": 0.0002,
+      "loss": 1.1619,
+      "step": 550
+    },
+    {
+      "epoch": 0.43512043512043513,
+      "grad_norm": 0.2381313443183899,
+      "learning_rate": 0.0002,
+      "loss": 1.1504,
+      "step": 560
+    },
+    {
+      "epoch": 0.4428904428904429,
+      "grad_norm": 0.22092527151107788,
+      "learning_rate": 0.0002,
+      "loss": 1.1847,
+      "step": 570
+    },
+    {
+      "epoch": 0.4506604506604507,
+      "grad_norm": 0.17754223942756653,
+      "learning_rate": 0.0002,
+      "loss": 1.0738,
+      "step": 580
+    },
+    {
+      "epoch": 0.45843045843045843,
+      "grad_norm": 0.20770515501499176,
+      "learning_rate": 0.0002,
+      "loss": 1.1371,
+      "step": 590
+    },
+    {
+      "epoch": 0.4662004662004662,
+      "grad_norm": 0.1945110708475113,
+      "learning_rate": 0.0002,
+      "loss": 1.1105,
+      "step": 600
+    },
+    {
+      "epoch": 0.473970473970474,
+      "grad_norm": 0.1895918995141983,
+      "learning_rate": 0.0002,
+      "loss": 1.0277,
+      "step": 610
+    },
+    {
+      "epoch": 0.48174048174048173,
+      "grad_norm": 0.2508707642555237,
+      "learning_rate": 0.0002,
+      "loss": 1.178,
+      "step": 620
+    },
+    {
+      "epoch": 0.48951048951048953,
+      "grad_norm": 0.22974248230457306,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 630
+    },
+    {
+      "epoch": 0.4972804972804973,
+      "grad_norm": 0.23621833324432373,
+      "learning_rate": 0.0002,
+      "loss": 1.1523,
+      "step": 640
+    },
+    {
+      "epoch": 0.5050505050505051,
+      "grad_norm": 0.19252857565879822,
+      "learning_rate": 0.0002,
+      "loss": 1.1975,
+      "step": 650
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 0.19978870451450348,
+      "learning_rate": 0.0002,
+      "loss": 1.0326,
+      "step": 660
+    },
+    {
+      "epoch": 0.5205905205905206,
+      "grad_norm": 0.6729635000228882,
+      "learning_rate": 0.0002,
+      "loss": 1.1651,
+      "step": 670
+    },
+    {
+      "epoch": 0.5283605283605284,
+      "grad_norm": 0.3017582893371582,
+      "learning_rate": 0.0002,
+      "loss": 1.1875,
+      "step": 680
+    },
+    {
+      "epoch": 0.5361305361305362,
+      "grad_norm": 0.20520491898059845,
+      "learning_rate": 0.0002,
+      "loss": 1.1229,
+      "step": 690
+    },
+    {
+      "epoch": 0.5439005439005439,
+      "grad_norm": 0.21933147311210632,
+      "learning_rate": 0.0002,
+      "loss": 1.1065,
+      "step": 700
+    },
+    {
+      "epoch": 0.5516705516705517,
+      "grad_norm": 0.21090354025363922,
+      "learning_rate": 0.0002,
+      "loss": 1.0411,
+      "step": 710
+    },
+    {
+      "epoch": 0.5594405594405595,
+      "grad_norm": 0.2536642551422119,
+      "learning_rate": 0.0002,
+      "loss": 1.2321,
+      "step": 720
+    },
+    {
+      "epoch": 0.5672105672105672,
+      "grad_norm": 0.22613082826137543,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 730
+    },
+    {
+      "epoch": 0.574980574980575,
+      "grad_norm": 0.20228610932826996,
+      "learning_rate": 0.0002,
+      "loss": 1.2138,
+      "step": 740
+    },
+    {
+      "epoch": 0.5827505827505828,
+      "grad_norm": 0.1765434592962265,
+      "learning_rate": 0.0002,
+      "loss": 1.2434,
+      "step": 750
+    },
+    {
+      "epoch": 0.5905205905205905,
+      "grad_norm": 0.20663489401340485,
+      "learning_rate": 0.0002,
+      "loss": 1.0774,
+      "step": 760
+    },
+    {
+      "epoch": 0.5982905982905983,
+      "grad_norm": 0.19824334979057312,
+      "learning_rate": 0.0002,
+      "loss": 1.1889,
+      "step": 770
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.20710207521915436,
+      "learning_rate": 0.0002,
+      "loss": 1.0703,
+      "step": 780
+    },
+    {
+      "epoch": 0.6138306138306139,
+      "grad_norm": 0.15639153122901917,
+      "learning_rate": 0.0002,
+      "loss": 1.1448,
+      "step": 790
+    },
+    {
+      "epoch": 0.6216006216006216,
+      "grad_norm": 0.29030027985572815,
+      "learning_rate": 0.0002,
+      "loss": 1.093,
+      "step": 800
+    },
+    {
+      "epoch": 0.6293706293706294,
+      "grad_norm": 0.21650588512420654,
+      "learning_rate": 0.0002,
+      "loss": 1.0669,
+      "step": 810
+    },
+    {
+      "epoch": 0.6371406371406372,
+      "grad_norm": 0.5422983169555664,
+      "learning_rate": 0.0002,
+      "loss": 1.1659,
+      "step": 820
+    },
+    {
+      "epoch": 0.6449106449106449,
+      "grad_norm": 0.2035626322031021,
+      "learning_rate": 0.0002,
+      "loss": 1.0996,
+      "step": 830
+    },
+    {
+      "epoch": 0.6526806526806527,
+      "grad_norm": 0.23480531573295593,
+      "learning_rate": 0.0002,
+      "loss": 1.2308,
+      "step": 840
+    },
+    {
+      "epoch": 0.6604506604506605,
+      "grad_norm": 0.19458115100860596,
+      "learning_rate": 0.0002,
+      "loss": 1.2648,
+      "step": 850
+    },
+    {
+      "epoch": 0.6682206682206682,
+      "grad_norm": 0.2670581638813019,
+      "learning_rate": 0.0002,
+      "loss": 1.0638,
+      "step": 860
+    },
+    {
+      "epoch": 0.675990675990676,
+      "grad_norm": 0.1925385743379593,
+      "learning_rate": 0.0002,
+      "loss": 1.1718,
+      "step": 870
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.23513855040073395,
+      "learning_rate": 0.0002,
+      "loss": 1.068,
+      "step": 880
+    },
+    {
+      "epoch": 0.6915306915306916,
+      "grad_norm": 0.2000550478696823,
+      "learning_rate": 0.0002,
+      "loss": 1.2692,
+      "step": 890
+    },
+    {
+      "epoch": 0.6993006993006993,
+      "grad_norm": 0.26091018319129944,
+      "learning_rate": 0.0002,
+      "loss": 1.1071,
+      "step": 900
+    },
+    {
+      "epoch": 0.7070707070707071,
+      "grad_norm": 0.18439704179763794,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 910
+    },
+    {
+      "epoch": 0.7148407148407149,
+      "grad_norm": 0.2290486991405487,
+      "learning_rate": 0.0002,
+      "loss": 1.1189,
+      "step": 920
+    },
+    {
+      "epoch": 0.7226107226107226,
+      "grad_norm": 0.2338181883096695,
+      "learning_rate": 0.0002,
+      "loss": 1.1858,
+      "step": 930
+    },
+    {
+      "epoch": 0.7303807303807304,
+      "grad_norm": 0.19802013039588928,
+      "learning_rate": 0.0002,
+      "loss": 0.9945,
+      "step": 940
+    },
+    {
+      "epoch": 0.7381507381507382,
+      "grad_norm": 0.19329555332660675,
+      "learning_rate": 0.0002,
+      "loss": 0.9957,
+      "step": 950
+    },
+    {
+      "epoch": 0.745920745920746,
+      "grad_norm": 0.22874726355075836,
+      "learning_rate": 0.0002,
+      "loss": 1.0822,
+      "step": 960
+    },
+    {
+      "epoch": 0.7536907536907537,
+      "grad_norm": 0.24897505342960358,
+      "learning_rate": 0.0002,
+      "loss": 1.1034,
+      "step": 970
+    },
+    {
+      "epoch": 0.7614607614607615,
+      "grad_norm": 0.19370736181735992,
+      "learning_rate": 0.0002,
+      "loss": 1.1134,
+      "step": 980
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.18071319162845612,
+      "learning_rate": 0.0002,
+      "loss": 1.0785,
+      "step": 990
+    },
+    {
+      "epoch": 0.777000777000777,
+      "grad_norm": 0.941108226776123,
+      "learning_rate": 0.0002,
+      "loss": 1.1866,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7847707847707848,
+      "grad_norm": 0.21083903312683105,
+      "learning_rate": 0.0002,
+      "loss": 1.1012,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7925407925407926,
+      "grad_norm": 0.2541967034339905,
+      "learning_rate": 0.0002,
+      "loss": 1.1665,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8003108003108003,
+      "grad_norm": 0.2519970238208771,
+      "learning_rate": 0.0002,
+      "loss": 1.0513,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 0.23185153305530548,
+      "learning_rate": 0.0002,
+      "loss": 1.0795,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8158508158508159,
+      "grad_norm": 0.16371922194957733,
+      "learning_rate": 0.0002,
+      "loss": 1.0451,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8236208236208237,
+      "grad_norm": 0.23721691966056824,
+      "learning_rate": 0.0002,
+      "loss": 1.2314,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8313908313908314,
+      "grad_norm": 0.28295132517814636,
+      "learning_rate": 0.0002,
+      "loss": 1.1663,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8391608391608392,
+      "grad_norm": 0.2982921004295349,
+      "learning_rate": 0.0002,
+      "loss": 1.0625,
+      "step": 1080
+    },
+    {
+      "epoch": 0.846930846930847,
+      "grad_norm": 0.2634515166282654,
+      "learning_rate": 0.0002,
+      "loss": 1.1936,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8547008547008547,
+      "grad_norm": 0.2118266075849533,
+      "learning_rate": 0.0002,
+      "loss": 1.2291,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8624708624708625,
+      "grad_norm": 0.2321753352880478,
+      "learning_rate": 0.0002,
+      "loss": 1.1269,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8702408702408703,
+      "grad_norm": 0.1881166696548462,
+      "learning_rate": 0.0002,
+      "loss": 1.122,
+      "step": 1120
+    },
+    {
+      "epoch": 0.878010878010878,
+      "grad_norm": 0.1806720793247223,
+      "learning_rate": 0.0002,
+      "loss": 1.2244,
+      "step": 1130
+    },
+    {
+      "epoch": 0.8857808857808858,
+      "grad_norm": 0.2697276473045349,
+      "learning_rate": 0.0002,
+      "loss": 1.1129,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8935508935508936,
+      "grad_norm": 0.1976407915353775,
+      "learning_rate": 0.0002,
+      "loss": 1.0894,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9013209013209014,
+      "grad_norm": 0.2299172729253769,
+      "learning_rate": 0.0002,
+      "loss": 1.1296,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.2056407332420349,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9168609168609169,
+      "grad_norm": 0.2531585395336151,
+      "learning_rate": 0.0002,
+      "loss": 0.9501,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9246309246309247,
+      "grad_norm": 0.21540220081806183,
+      "learning_rate": 0.0002,
+      "loss": 0.9781,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9324009324009324,
+      "grad_norm": 0.23623155057430267,
+      "learning_rate": 0.0002,
+      "loss": 1.2535,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9401709401709402,
+      "grad_norm": 0.15985913574695587,
+      "learning_rate": 0.0002,
+      "loss": 1.2165,
+      "step": 1210
+    },
+    {
+      "epoch": 0.947940947940948,
+      "grad_norm": 0.21025371551513672,
+      "learning_rate": 0.0002,
+      "loss": 1.1809,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9557109557109557,
+      "grad_norm": 0.21953997015953064,
+      "learning_rate": 0.0002,
+      "loss": 1.0808,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9634809634809635,
+      "grad_norm": 0.2306654453277588,
+      "learning_rate": 0.0002,
+      "loss": 1.0345,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9712509712509713,
+      "grad_norm": 0.18655075132846832,
+      "learning_rate": 0.0002,
+      "loss": 1.168,
+      "step": 1250
+    },
+    {
+      "epoch": 0.9790209790209791,
+      "grad_norm": 0.1910385638475418,
+      "learning_rate": 0.0002,
+      "loss": 1.1434,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9867909867909868,
+      "grad_norm": 0.1794031411409378,
+      "learning_rate": 0.0002,
+      "loss": 1.1564,
+      "step": 1270
+    },
+    {
+      "epoch": 0.9945609945609946,
+      "grad_norm": 0.21666628122329712,
+      "learning_rate": 0.0002,
+      "loss": 1.147,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.129408359527588,
+      "eval_runtime": 126.1267,
+      "eval_samples_per_second": 3.631,
+      "eval_steps_per_second": 0.46,
+      "step": 1287
+    },
+    {
+      "epoch": 1.0023310023310024,
+      "grad_norm": 0.21446684002876282,
+      "learning_rate": 0.0002,
+      "loss": 1.0804,
+      "step": 1290
+    },
+    {
+      "epoch": 1.0101010101010102,
+      "grad_norm": 0.19008079171180725,
+      "learning_rate": 0.0002,
+      "loss": 1.0593,
+      "step": 1300
+    },
+    {
+      "epoch": 1.017871017871018,
+      "grad_norm": 0.2067517787218094,
+      "learning_rate": 0.0002,
+      "loss": 1.015,
+      "step": 1310
+    },
+    {
+      "epoch": 1.0256410256410255,
+      "grad_norm": 0.23490257561206818,
+      "learning_rate": 0.0002,
+      "loss": 1.1278,
+      "step": 1320
+    },
+    {
+      "epoch": 1.0334110334110334,
+      "grad_norm": 0.2786276340484619,
+      "learning_rate": 0.0002,
+      "loss": 1.0911,
+      "step": 1330
+    },
+    {
+      "epoch": 1.0411810411810412,
+      "grad_norm": 0.22309809923171997,
+      "learning_rate": 0.0002,
+      "loss": 1.1132,
+      "step": 1340
+    },
+    {
+      "epoch": 1.048951048951049,
+      "grad_norm": 0.1913406401872635,
+      "learning_rate": 0.0002,
+      "loss": 1.0291,
+      "step": 1350
+    },
+    {
+      "epoch": 1.0567210567210568,
+      "grad_norm": 0.3459428548812866,
+      "learning_rate": 0.0002,
+      "loss": 1.273,
+      "step": 1360
+    },
+    {
+      "epoch": 1.0644910644910646,
+      "grad_norm": 0.23205991089344025,
+      "learning_rate": 0.0002,
+      "loss": 1.1001,
+      "step": 1370
+    },
+    {
+      "epoch": 1.0722610722610724,
+      "grad_norm": 0.22524729371070862,
+      "learning_rate": 0.0002,
+      "loss": 1.0243,
+      "step": 1380
+    },
+    {
+      "epoch": 1.08003108003108,
+      "grad_norm": 0.22871436178684235,
+      "learning_rate": 0.0002,
+      "loss": 1.0949,
+      "step": 1390
+    },
+    {
+      "epoch": 1.0878010878010878,
+      "grad_norm": 0.31112268567085266,
+      "learning_rate": 0.0002,
+      "loss": 1.0513,
+      "step": 1400
+    },
+    {
+      "epoch": 1.0955710955710956,
+      "grad_norm": 0.19766485691070557,
+      "learning_rate": 0.0002,
+      "loss": 0.9782,
+      "step": 1410
+    },
+    {
+      "epoch": 1.1033411033411034,
+      "grad_norm": 0.2941485047340393,
+      "learning_rate": 0.0002,
+      "loss": 0.982,
+      "step": 1420
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 0.24462586641311646,
+      "learning_rate": 0.0002,
+      "loss": 1.0206,
+      "step": 1430
+    },
+    {
+      "epoch": 1.118881118881119,
+      "grad_norm": 0.2066200077533722,
+      "learning_rate": 0.0002,
+      "loss": 1.0278,
+      "step": 1440
+    },
+    {
+      "epoch": 1.1266511266511268,
+      "grad_norm": 0.2350434511899948,
+      "learning_rate": 0.0002,
+      "loss": 1.0144,
+      "step": 1450
+    },
+    {
+      "epoch": 1.1344211344211343,
+      "grad_norm": 0.287502646446228,
+      "learning_rate": 0.0002,
+      "loss": 1.0311,
+      "step": 1460
+    },
+    {
+      "epoch": 1.1421911421911422,
+      "grad_norm": 0.2597036361694336,
+      "learning_rate": 0.0002,
+      "loss": 1.0736,
+      "step": 1470
+    },
+    {
+      "epoch": 1.14996114996115,
+      "grad_norm": 0.2032802850008011,
+      "learning_rate": 0.0002,
+      "loss": 0.929,
+      "step": 1480
+    },
+    {
+      "epoch": 1.1577311577311578,
+      "grad_norm": 0.22740910947322845,
+      "learning_rate": 0.0002,
+      "loss": 1.046,
+      "step": 1490
+    },
+    {
+      "epoch": 1.1655011655011656,
+      "grad_norm": 0.22720582783222198,
+      "learning_rate": 0.0002,
+      "loss": 1.0735,
+      "step": 1500
+    },
+    {
+      "epoch": 1.1732711732711734,
+      "grad_norm": 0.20722301304340363,
+      "learning_rate": 0.0002,
+      "loss": 1.1148,
+      "step": 1510
+    },
+    {
+      "epoch": 1.1810411810411812,
+      "grad_norm": 0.2846643626689911,
+      "learning_rate": 0.0002,
+      "loss": 1.126,
+      "step": 1520
+    },
+    {
+      "epoch": 1.1888111888111887,
+      "grad_norm": 0.24171462655067444,
+      "learning_rate": 0.0002,
+      "loss": 1.045,
+      "step": 1530
+    },
+    {
+      "epoch": 1.1965811965811965,
+      "grad_norm": 0.25066953897476196,
+      "learning_rate": 0.0002,
+      "loss": 1.1682,
+      "step": 1540
+    },
+    {
+      "epoch": 1.2043512043512044,
+      "grad_norm": 0.23974713683128357,
+      "learning_rate": 0.0002,
+      "loss": 1.031,
+      "step": 1550
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 0.22091583907604218,
+      "learning_rate": 0.0002,
+      "loss": 1.1378,
+      "step": 1560
+    },
+    {
+      "epoch": 1.21989121989122,
+      "grad_norm": 0.24051672220230103,
+      "learning_rate": 0.0002,
+      "loss": 0.9938,
+      "step": 1570
+    },
+    {
+      "epoch": 1.2276612276612275,
+      "grad_norm": 0.26666808128356934,
+      "learning_rate": 0.0002,
+      "loss": 1.1118,
+      "step": 1580
+    },
+    {
+      "epoch": 1.2354312354312353,
+      "grad_norm": 0.2351372390985489,
+      "learning_rate": 0.0002,
+      "loss": 1.0123,
+      "step": 1590
+    },
+    {
+      "epoch": 1.2432012432012431,
+      "grad_norm": 0.23197518289089203,
+      "learning_rate": 0.0002,
+      "loss": 1.0407,
+      "step": 1600
+    },
+    {
+      "epoch": 1.250971250971251,
+      "grad_norm": 0.24768364429473877,
+      "learning_rate": 0.0002,
+      "loss": 1.1263,
+      "step": 1610
+    },
+    {
+      "epoch": 1.2587412587412588,
+      "grad_norm": 0.3164500594139099,
+      "learning_rate": 0.0002,
+      "loss": 1.0233,
+      "step": 1620
+    },
+    {
+      "epoch": 1.2665112665112666,
+      "grad_norm": 0.23763787746429443,
+      "learning_rate": 0.0002,
+      "loss": 1.101,
+      "step": 1630
+    },
+    {
+      "epoch": 1.2742812742812744,
+      "grad_norm": 0.2657753527164459,
+      "learning_rate": 0.0002,
+      "loss": 1.1305,
+      "step": 1640
+    },
+    {
+      "epoch": 1.282051282051282,
+      "grad_norm": 0.19181232154369354,
+      "learning_rate": 0.0002,
+      "loss": 1.041,
+      "step": 1650
+    },
+    {
+      "epoch": 1.2898212898212897,
+      "grad_norm": 0.24094757437705994,
+      "learning_rate": 0.0002,
+      "loss": 1.0958,
+      "step": 1660
+    },
+    {
+      "epoch": 1.2975912975912975,
+      "grad_norm": 0.28938543796539307,
+      "learning_rate": 0.0002,
+      "loss": 1.0631,
+      "step": 1670
+    },
+    {
+      "epoch": 1.3053613053613053,
+      "grad_norm": 0.27220043540000916,
+      "learning_rate": 0.0002,
+      "loss": 1.1042,
+      "step": 1680
+    },
+    {
+      "epoch": 1.3131313131313131,
+      "grad_norm": 0.366868257522583,
+      "learning_rate": 0.0002,
+      "loss": 1.0575,
+      "step": 1690
+    },
+    {
+      "epoch": 1.320901320901321,
+      "grad_norm": 0.21456655859947205,
+      "learning_rate": 0.0002,
+      "loss": 1.0158,
+      "step": 1700
+    },
+    {
+      "epoch": 1.3286713286713288,
+      "grad_norm": 0.3704378306865692,
+      "learning_rate": 0.0002,
+      "loss": 1.0548,
+      "step": 1710
+    },
+    {
+      "epoch": 1.3364413364413363,
+      "grad_norm": 0.2771696448326111,
+      "learning_rate": 0.0002,
+      "loss": 1.0176,
+      "step": 1720
+    },
+    {
+      "epoch": 1.3442113442113441,
+      "grad_norm": 0.2547997534275055,
+      "learning_rate": 0.0002,
+      "loss": 0.9911,
+      "step": 1730
+    },
+    {
+      "epoch": 1.351981351981352,
+      "grad_norm": 0.285001277923584,
+      "learning_rate": 0.0002,
+      "loss": 1.0533,
+      "step": 1740
+    },
+    {
+      "epoch": 1.3597513597513597,
+      "grad_norm": 0.24063028395175934,
+      "learning_rate": 0.0002,
+      "loss": 1.0944,
+      "step": 1750
+    },
+    {
+      "epoch": 1.3675213675213675,
+      "grad_norm": 0.21229241788387299,
+      "learning_rate": 0.0002,
+      "loss": 0.9977,
+      "step": 1760
+    },
+    {
+      "epoch": 1.3752913752913754,
+      "grad_norm": 0.2268926501274109,
+      "learning_rate": 0.0002,
+      "loss": 0.9936,
+      "step": 1770
+    },
+    {
+      "epoch": 1.3830613830613832,
+      "grad_norm": 0.3715478181838989,
+      "learning_rate": 0.0002,
+      "loss": 1.0067,
+      "step": 1780
+    },
+    {
+      "epoch": 1.3908313908313907,
+      "grad_norm": 0.21592973172664642,
+      "learning_rate": 0.0002,
+      "loss": 1.0999,
+      "step": 1790
+    },
+    {
+      "epoch": 1.3986013986013985,
+      "grad_norm": 0.2854160666465759,
+      "learning_rate": 0.0002,
+      "loss": 1.0221,
+      "step": 1800
+    },
+    {
+      "epoch": 1.4063714063714063,
+      "grad_norm": 0.2258583903312683,
+      "learning_rate": 0.0002,
+      "loss": 1.0108,
+      "step": 1810
+    },
+    {
+      "epoch": 1.4141414141414141,
+      "grad_norm": 0.2583931088447571,
+      "learning_rate": 0.0002,
+      "loss": 1.0231,
+      "step": 1820
+    },
+    {
+      "epoch": 1.421911421911422,
+      "grad_norm": 0.2097582221031189,
+      "learning_rate": 0.0002,
+      "loss": 0.9608,
+      "step": 1830
+    },
+    {
+      "epoch": 1.4296814296814297,
+      "grad_norm": 0.317903995513916,
+      "learning_rate": 0.0002,
+      "loss": 1.1491,
+      "step": 1840
+    },
+    {
+      "epoch": 1.4374514374514376,
+      "grad_norm": 0.3420594036579132,
+      "learning_rate": 0.0002,
+      "loss": 0.917,
+      "step": 1850
+    },
+    {
+      "epoch": 1.4452214452214451,
+      "grad_norm": 0.207811176776886,
+      "learning_rate": 0.0002,
+      "loss": 1.0408,
+      "step": 1860
+    },
+    {
+      "epoch": 1.452991452991453,
+      "grad_norm": 0.2251322716474533,
+      "learning_rate": 0.0002,
+      "loss": 1.2072,
+      "step": 1870
+    },
+    {
+      "epoch": 1.4607614607614607,
+      "grad_norm": 0.3275671899318695,
+      "learning_rate": 0.0002,
+      "loss": 1.0406,
+      "step": 1880
+    },
+    {
+      "epoch": 1.4685314685314685,
+      "grad_norm": 0.26664847135543823,
+      "learning_rate": 0.0002,
+      "loss": 1.0132,
+      "step": 1890
+    },
+    {
+      "epoch": 1.4763014763014763,
+      "grad_norm": 0.3018657863140106,
+      "learning_rate": 0.0002,
+      "loss": 1.0126,
+      "step": 1900
+    },
+    {
+      "epoch": 1.4840714840714841,
+      "grad_norm": 0.23472832143306732,
+      "learning_rate": 0.0002,
+      "loss": 1.0279,
+      "step": 1910
+    },
+    {
+      "epoch": 1.491841491841492,
+      "grad_norm": 0.25500139594078064,
+      "learning_rate": 0.0002,
+      "loss": 1.114,
+      "step": 1920
+    },
+    {
+      "epoch": 1.4996114996114995,
+      "grad_norm": 0.25684499740600586,
+      "learning_rate": 0.0002,
+      "loss": 0.9935,
+      "step": 1930
+    },
+    {
+      "epoch": 1.5073815073815073,
+      "grad_norm": 0.24417231976985931,
+      "learning_rate": 0.0002,
+      "loss": 1.0129,
+      "step": 1940
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.24856749176979065,
+      "learning_rate": 0.0002,
+      "loss": 1.1556,
+      "step": 1950
+    },
+    {
+      "epoch": 1.522921522921523,
+      "grad_norm": 0.21641327440738678,
+      "learning_rate": 0.0002,
+      "loss": 1.0385,
+      "step": 1960
+    },
+    {
+      "epoch": 1.5306915306915307,
+      "grad_norm": 0.2609858810901642,
+      "learning_rate": 0.0002,
+      "loss": 0.993,
+      "step": 1970
+    },
+    {
+      "epoch": 1.5384615384615383,
+      "grad_norm": 0.3511698842048645,
+      "learning_rate": 0.0002,
+      "loss": 1.0208,
+      "step": 1980
+    },
+    {
+      "epoch": 1.5462315462315463,
+      "grad_norm": 0.22061559557914734,
+      "learning_rate": 0.0002,
+      "loss": 1.0802,
+      "step": 1990
+    },
+    {
+      "epoch": 1.554001554001554,
+      "grad_norm": 0.27993711829185486,
+      "learning_rate": 0.0002,
+      "loss": 1.0099,
+      "step": 2000
+    },
+    {
+      "epoch": 1.5617715617715617,
+      "grad_norm": 0.4823858141899109,
+      "learning_rate": 0.0002,
+      "loss": 0.9633,
+      "step": 2010
+    },
+    {
+      "epoch": 1.5695415695415695,
+      "grad_norm": 0.3459807336330414,
+      "learning_rate": 0.0002,
+      "loss": 0.993,
+      "step": 2020
+    },
+    {
+      "epoch": 1.5773115773115773,
+      "grad_norm": 0.36406180262565613,
+      "learning_rate": 0.0002,
+      "loss": 1.0814,
+      "step": 2030
+    },
+    {
+      "epoch": 1.5850815850815851,
+      "grad_norm": 0.3524165451526642,
+      "learning_rate": 0.0002,
+      "loss": 0.9362,
+      "step": 2040
+    },
+    {
+      "epoch": 1.5928515928515927,
+      "grad_norm": 0.21314221620559692,
+      "learning_rate": 0.0002,
+      "loss": 1.0066,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6006216006216007,
+      "grad_norm": 0.2923882305622101,
+      "learning_rate": 0.0002,
+      "loss": 1.1186,
+      "step": 2060
+    },
+    {
+      "epoch": 1.6083916083916083,
+      "grad_norm": 0.26135843992233276,
+      "learning_rate": 0.0002,
+      "loss": 1.0065,
+      "step": 2070
+    },
+    {
+      "epoch": 1.6161616161616161,
+      "grad_norm": 0.23447547852993011,
+      "learning_rate": 0.0002,
+      "loss": 1.1106,
+      "step": 2080
+    },
+    {
+      "epoch": 1.623931623931624,
+      "grad_norm": 0.34268563985824585,
+      "learning_rate": 0.0002,
+      "loss": 1.0651,
+      "step": 2090
+    },
+    {
+      "epoch": 1.6317016317016317,
+      "grad_norm": 0.38466891646385193,
+      "learning_rate": 0.0002,
+      "loss": 1.0834,
+      "step": 2100
+    },
+    {
+      "epoch": 1.6394716394716395,
+      "grad_norm": 0.34389930963516235,
+      "learning_rate": 0.0002,
+      "loss": 0.9812,
+      "step": 2110
+    },
+    {
+      "epoch": 1.6472416472416471,
+      "grad_norm": 0.23925693333148956,
+      "learning_rate": 0.0002,
+      "loss": 1.0787,
+      "step": 2120
+    },
+    {
+      "epoch": 1.6550116550116551,
+      "grad_norm": 0.2986578941345215,
+      "learning_rate": 0.0002,
+      "loss": 0.9894,
+      "step": 2130
+    },
+    {
+      "epoch": 1.6627816627816627,
+      "grad_norm": 0.32315072417259216,
+      "learning_rate": 0.0002,
+      "loss": 1.046,
+      "step": 2140
+    },
+    {
+      "epoch": 1.6705516705516705,
+      "grad_norm": 0.26620298624038696,
+      "learning_rate": 0.0002,
+      "loss": 0.9985,
+      "step": 2150
+    },
+    {
+      "epoch": 1.6783216783216783,
+      "grad_norm": 0.20524263381958008,
+      "learning_rate": 0.0002,
+      "loss": 1.0624,
+      "step": 2160
+    },
+    {
+      "epoch": 1.6860916860916861,
+      "grad_norm": 0.30458933115005493,
+      "learning_rate": 0.0002,
+      "loss": 1.1447,
+      "step": 2170
+    },
+    {
+      "epoch": 1.693861693861694,
+      "grad_norm": 0.2668599486351013,
+      "learning_rate": 0.0002,
+      "loss": 1.0715,
+      "step": 2180
+    },
+    {
+      "epoch": 1.7016317016317015,
+      "grad_norm": 0.25038737058639526,
+      "learning_rate": 0.0002,
+      "loss": 1.036,
+      "step": 2190
+    },
+    {
+      "epoch": 1.7094017094017095,
+      "grad_norm": 0.31270867586135864,
+      "learning_rate": 0.0002,
+      "loss": 1.0005,
+      "step": 2200
+    },
+    {
+      "epoch": 1.7171717171717171,
+      "grad_norm": 0.37731093168258667,
+      "learning_rate": 0.0002,
+      "loss": 1.1114,
+      "step": 2210
+    },
+    {
+      "epoch": 1.724941724941725,
+      "grad_norm": 0.28433677554130554,
+      "learning_rate": 0.0002,
+      "loss": 0.9992,
+      "step": 2220
+    },
+    {
+      "epoch": 1.7327117327117327,
+      "grad_norm": 0.30233511328697205,
+      "learning_rate": 0.0002,
+      "loss": 1.0417,
+      "step": 2230
+    },
+    {
+      "epoch": 1.7404817404817405,
+      "grad_norm": 0.29789718985557556,
+      "learning_rate": 0.0002,
+      "loss": 0.9921,
+      "step": 2240
+    },
+    {
+      "epoch": 1.7482517482517483,
+      "grad_norm": 0.26955488324165344,
+      "learning_rate": 0.0002,
+      "loss": 1.0434,
+      "step": 2250
+    },
+    {
+      "epoch": 1.756021756021756,
+      "grad_norm": 0.26009881496429443,
+      "learning_rate": 0.0002,
+      "loss": 0.9843,
+      "step": 2260
+    },
+    {
+      "epoch": 1.763791763791764,
+      "grad_norm": 0.2825351059436798,
+      "learning_rate": 0.0002,
+      "loss": 1.0736,
+      "step": 2270
+    },
+    {
+      "epoch": 1.7715617715617715,
+      "grad_norm": 0.2807641923427582,
+      "learning_rate": 0.0002,
+      "loss": 1.0114,
+      "step": 2280
+    },
+    {
+      "epoch": 1.7793317793317793,
+      "grad_norm": 0.33065202832221985,
+      "learning_rate": 0.0002,
+      "loss": 1.0286,
+      "step": 2290
+    },
+    {
+      "epoch": 1.7871017871017871,
+      "grad_norm": 0.29354771971702576,
+      "learning_rate": 0.0002,
+      "loss": 0.9616,
+      "step": 2300
+    },
+    {
+      "epoch": 1.7948717948717947,
+      "grad_norm": 0.23706668615341187,
+      "learning_rate": 0.0002,
+      "loss": 0.9911,
+      "step": 2310
+    },
+    {
+      "epoch": 1.8026418026418027,
+      "grad_norm": 0.3261558413505554,
+      "learning_rate": 0.0002,
+      "loss": 0.9754,
+      "step": 2320
+    },
+    {
+      "epoch": 1.8104118104118103,
+      "grad_norm": 0.28066813945770264,
+      "learning_rate": 0.0002,
+      "loss": 0.925,
+      "step": 2330
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.25087133049964905,
+      "learning_rate": 0.0002,
+      "loss": 1.2104,
+      "step": 2340
+    },
+    {
+      "epoch": 1.825951825951826,
+      "grad_norm": 0.22244273126125336,
+      "learning_rate": 0.0002,
+      "loss": 1.094,
+      "step": 2350
+    },
+    {
+      "epoch": 1.8337218337218337,
+      "grad_norm": 0.24885690212249756,
+      "learning_rate": 0.0002,
+      "loss": 1.0184,
+      "step": 2360
+    },
+    {
+      "epoch": 1.8414918414918415,
+      "grad_norm": 0.2348184436559677,
+      "learning_rate": 0.0002,
+      "loss": 1.032,
+      "step": 2370
+    },
+    {
+      "epoch": 1.8492618492618491,
+      "grad_norm": 0.2076595574617386,
+      "learning_rate": 0.0002,
+      "loss": 1.0113,
+      "step": 2380
+    },
+    {
+      "epoch": 1.8570318570318571,
+      "grad_norm": 0.22159697115421295,
+      "learning_rate": 0.0002,
+      "loss": 1.2121,
+      "step": 2390
+    },
+    {
+      "epoch": 1.8648018648018647,
+      "grad_norm": 0.32671627402305603,
+      "learning_rate": 0.0002,
+      "loss": 1.0505,
+      "step": 2400
+    },
+    {
+      "epoch": 1.8725718725718725,
+      "grad_norm": 0.39369890093803406,
+      "learning_rate": 0.0002,
+      "loss": 1.0448,
+      "step": 2410
+    },
+    {
+      "epoch": 1.8803418803418803,
+      "grad_norm": 0.2516529858112335,
+      "learning_rate": 0.0002,
+      "loss": 1.0147,
+      "step": 2420
+    },
+    {
+      "epoch": 1.8881118881118881,
+      "grad_norm": 0.3072526156902313,
+      "learning_rate": 0.0002,
+      "loss": 1.1342,
+      "step": 2430
+    },
+    {
+      "epoch": 1.895881895881896,
+      "grad_norm": 0.3220444321632385,
+      "learning_rate": 0.0002,
+      "loss": 1.0353,
+      "step": 2440
+    },
+    {
+      "epoch": 1.9036519036519035,
+      "grad_norm": 0.21818310022354126,
+      "learning_rate": 0.0002,
+      "loss": 1.0583,
+      "step": 2450
+    },
+    {
+      "epoch": 1.9114219114219115,
+      "grad_norm": 0.3189812898635864,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 2460
+    },
+    {
+      "epoch": 1.9191919191919191,
+      "grad_norm": 0.20715034008026123,
+      "learning_rate": 0.0002,
+      "loss": 1.1327,
+      "step": 2470
+    },
+    {
+      "epoch": 1.926961926961927,
+      "grad_norm": 0.28550365567207336,
+      "learning_rate": 0.0002,
+      "loss": 0.9286,
+      "step": 2480
+    },
+    {
+      "epoch": 1.9347319347319347,
+      "grad_norm": 0.25586360692977905,
+      "learning_rate": 0.0002,
+      "loss": 1.0539,
+      "step": 2490
+    },
+    {
+      "epoch": 1.9425019425019425,
+      "grad_norm": 0.46703749895095825,
+      "learning_rate": 0.0002,
+      "loss": 1.1427,
+      "step": 2500
+    },
+    {
+      "epoch": 1.9502719502719503,
+      "grad_norm": 0.2730128765106201,
+      "learning_rate": 0.0002,
+      "loss": 0.9528,
+      "step": 2510
+    },
+    {
+      "epoch": 1.958041958041958,
+      "grad_norm": 0.32355520129203796,
+      "learning_rate": 0.0002,
+      "loss": 1.0792,
+      "step": 2520
+    },
+    {
+      "epoch": 1.965811965811966,
+      "grad_norm": 0.205318421125412,
+      "learning_rate": 0.0002,
+      "loss": 0.9698,
+      "step": 2530
+    },
+    {
+      "epoch": 1.9735819735819735,
+      "grad_norm": 0.3908768892288208,
+      "learning_rate": 0.0002,
+      "loss": 1.0714,
+      "step": 2540
+    },
+    {
+      "epoch": 1.9813519813519813,
+      "grad_norm": 0.289529025554657,
+      "learning_rate": 0.0002,
+      "loss": 1.012,
+      "step": 2550
+    },
+    {
+      "epoch": 1.9891219891219891,
+      "grad_norm": 0.2441825270652771,
+      "learning_rate": 0.0002,
+      "loss": 1.1298,
+      "step": 2560
+    },
+    {
+      "epoch": 1.996891996891997,
+      "grad_norm": 0.2515878975391388,
+      "learning_rate": 0.0002,
+      "loss": 1.229,
+      "step": 2570
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.110044240951538,
+      "eval_runtime": 86.3563,
+      "eval_samples_per_second": 5.304,
+      "eval_steps_per_second": 0.672,
+      "step": 2574
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10296,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1213862956590694e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b36338332cddabc7310f2540f453242087f16525e09249c71d1000e9b851228
+size 5560

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abd4643502bf4f418ce9aa93373acdf24248d2e0017eeddce0aa00b55881bb59
+size 80755416

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}