SirajRLX committed on Dec 24, 2025

Commit

e483cf3

verified ·

1 Parent(s): 61ac574

Add Qwen-14B SFT training run

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
sft_qwen_14B/best_adapter/README.md +207 -0
sft_qwen_14B/best_adapter/adapter_config.json +43 -0
sft_qwen_14B/best_adapter/adapter_model.safetensors +3 -0
sft_qwen_14B/best_adapter/training_args.bin +3 -0
sft_qwen_14B/checkpoints/checkpoint-1000/README.md +207 -0
sft_qwen_14B/checkpoints/checkpoint-1000/adapter_config.json +43 -0
sft_qwen_14B/checkpoints/checkpoint-1000/adapter_model.safetensors +3 -0
sft_qwen_14B/checkpoints/checkpoint-1000/optimizer.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-1000/rng_state.pth +3 -0
sft_qwen_14B/checkpoints/checkpoint-1000/scheduler.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-1000/trainer_state.json +3623 -0
sft_qwen_14B/checkpoints/checkpoint-1000/training_args.bin +3 -0
sft_qwen_14B/checkpoints/checkpoint-1500/README.md +207 -0
sft_qwen_14B/checkpoints/checkpoint-1500/adapter_config.json +43 -0
sft_qwen_14B/checkpoints/checkpoint-1500/adapter_model.safetensors +3 -0
sft_qwen_14B/checkpoints/checkpoint-1500/optimizer.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-1500/rng_state.pth +3 -0
sft_qwen_14B/checkpoints/checkpoint-1500/scheduler.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-1500/trainer_state.json +0 -0
sft_qwen_14B/checkpoints/checkpoint-1500/training_args.bin +3 -0
sft_qwen_14B/checkpoints/checkpoint-2000/README.md +207 -0
sft_qwen_14B/checkpoints/checkpoint-2000/adapter_config.json +43 -0
sft_qwen_14B/checkpoints/checkpoint-2000/adapter_model.safetensors +3 -0
sft_qwen_14B/checkpoints/checkpoint-2000/optimizer.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-2000/rng_state.pth +3 -0
sft_qwen_14B/checkpoints/checkpoint-2000/scheduler.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json +0 -0
sft_qwen_14B/checkpoints/checkpoint-2000/training_args.bin +3 -0
sft_qwen_14B/checkpoints/checkpoint-2500/README.md +207 -0
sft_qwen_14B/checkpoints/checkpoint-2500/adapter_config.json +43 -0
sft_qwen_14B/checkpoints/checkpoint-2500/adapter_model.safetensors +3 -0
sft_qwen_14B/checkpoints/checkpoint-2500/optimizer.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-2500/rng_state.pth +3 -0
sft_qwen_14B/checkpoints/checkpoint-2500/scheduler.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-2500/trainer_state.json +0 -0
sft_qwen_14B/checkpoints/checkpoint-2500/training_args.bin +3 -0
sft_qwen_14B/checkpoints/checkpoint-3000/README.md +207 -0
sft_qwen_14B/checkpoints/checkpoint-3000/adapter_config.json +43 -0
sft_qwen_14B/checkpoints/checkpoint-3000/adapter_model.safetensors +3 -0
sft_qwen_14B/checkpoints/checkpoint-3000/optimizer.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-3000/rng_state.pth +3 -0
sft_qwen_14B/checkpoints/checkpoint-3000/scheduler.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-3000/trainer_state.json +0 -0
sft_qwen_14B/checkpoints/checkpoint-3000/training_args.bin +3 -0
sft_qwen_14B/checkpoints/checkpoint-3500/README.md +207 -0
sft_qwen_14B/checkpoints/checkpoint-3500/adapter_config.json +43 -0
sft_qwen_14B/checkpoints/checkpoint-3500/adapter_model.safetensors +3 -0
sft_qwen_14B/checkpoints/checkpoint-3500/optimizer.pt +3 -0
sft_qwen_14B/checkpoints/checkpoint-3500/rng_state.pth +3 -0

.gitattributes CHANGED Viewed

@@ -42,3 +42,4 @@ cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge
 cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb filter=lfs diff=lfs merge=lfs -text

 cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb filter=lfs diff=lfs merge=lfs -text
+sft_qwen_14B/wandb/run-20251223_142702-ldjr67u6/run-ldjr67u6.wandb filter=lfs diff=lfs merge=lfs -text

sft_qwen_14B/best_adapter/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/best_adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/best_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3726a3859050973f5670196544ebe752cfffa4253f1767f5b0af266df2ace6b
+size 100715016

sft_qwen_14B/best_adapter/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4200830b23d19bc86049c280236c4a6b18c26d7061b5a57cc024888ec760920f
+size 5201

sft_qwen_14B/checkpoints/checkpoint-1000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/checkpoints/checkpoint-1000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/checkpoints/checkpoint-1000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9dee36ed11c4fd0959b1919f775473a19f346b153e63c34bcf79a533556e19e4
+size 100715016

sft_qwen_14B/checkpoints/checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a70e161c960aa70fcafcc1f927ed2d20fab7a428206158a303bd7dde1ca82e78
+size 201650659

sft_qwen_14B/checkpoints/checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0268028a2ba7d054993db8c39cdcda05f1f60e5f99a796fc9a68ac3c248a51f
+size 14645

sft_qwen_14B/checkpoints/checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71bca8257f6a48d8c00a2797d198b1ab37d03a410f59b9bb8e7d7f293ee8880b
+size 1465

sft_qwen_14B/checkpoints/checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3623 @@

+{
+  "best_global_step": 1000,
+  "best_metric": 0.9606748819351196,
+  "best_model_checkpoint": "runs/instruct_run_14b_v1/checkpoints/checkpoint-1000",
+  "epoch": 0.43149946062567424,
+  "eval_steps": 100,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008629989212513484,
+      "grad_norm": 0.36567428708076477,
+      "learning_rate": 1.7969451931716084e-07,
+      "loss": 1.6794371604919434,
+      "step": 2
+    },
+    {
+      "epoch": 0.001725997842502697,
+      "grad_norm": 0.4024646580219269,
+      "learning_rate": 5.390835579514825e-07,
+      "loss": 1.6853073835372925,
+      "step": 4
+    },
+    {
+      "epoch": 0.0025889967637540453,
+      "grad_norm": 0.40199393033981323,
+      "learning_rate": 8.984725965858042e-07,
+      "loss": 1.7621158361434937,
+      "step": 6
+    },
+    {
+      "epoch": 0.003451995685005394,
+      "grad_norm": 0.35409677028656006,
+      "learning_rate": 1.257861635220126e-06,
+      "loss": 1.633257269859314,
+      "step": 8
+    },
+    {
+      "epoch": 0.004314994606256742,
+      "grad_norm": 0.39087551832199097,
+      "learning_rate": 1.6172506738544475e-06,
+      "loss": 1.7374768257141113,
+      "step": 10
+    },
+    {
+      "epoch": 0.005177993527508091,
+      "grad_norm": 0.3586857318878174,
+      "learning_rate": 1.9766397124887693e-06,
+      "loss": 1.6955714225769043,
+      "step": 12
+    },
+    {
+      "epoch": 0.006040992448759439,
+      "grad_norm": 0.32755669951438904,
+      "learning_rate": 2.3360287511230908e-06,
+      "loss": 1.720664381980896,
+      "step": 14
+    },
+    {
+      "epoch": 0.006903991370010788,
+      "grad_norm": 0.4054872691631317,
+      "learning_rate": 2.6954177897574127e-06,
+      "loss": 1.6957035064697266,
+      "step": 16
+    },
+    {
+      "epoch": 0.007766990291262136,
+      "grad_norm": 0.37593814730644226,
+      "learning_rate": 3.0548068283917343e-06,
+      "loss": 1.7286947965621948,
+      "step": 18
+    },
+    {
+      "epoch": 0.008629989212513484,
+      "grad_norm": 0.3344813287258148,
+      "learning_rate": 3.414195867026056e-06,
+      "loss": 1.727295994758606,
+      "step": 20
+    },
+    {
+      "epoch": 0.009492988133764833,
+      "grad_norm": 0.357474148273468,
+      "learning_rate": 3.7735849056603773e-06,
+      "loss": 1.6727914810180664,
+      "step": 22
+    },
+    {
+      "epoch": 0.010355987055016181,
+      "grad_norm": 0.39115726947784424,
+      "learning_rate": 4.132973944294699e-06,
+      "loss": 1.6518884897232056,
+      "step": 24
+    },
+    {
+      "epoch": 0.01121898597626753,
+      "grad_norm": 0.4711727201938629,
+      "learning_rate": 4.492362982929021e-06,
+      "loss": 1.7868088483810425,
+      "step": 26
+    },
+    {
+      "epoch": 0.012081984897518877,
+      "grad_norm": 0.34112176299095154,
+      "learning_rate": 4.851752021563342e-06,
+      "loss": 1.6127634048461914,
+      "step": 28
+    },
+    {
+      "epoch": 0.012944983818770227,
+      "grad_norm": 0.5071991682052612,
+      "learning_rate": 5.211141060197664e-06,
+      "loss": 1.7858378887176514,
+      "step": 30
+    },
+    {
+      "epoch": 0.013807982740021575,
+      "grad_norm": 0.42048847675323486,
+      "learning_rate": 5.570530098831986e-06,
+      "loss": 1.7123326063156128,
+      "step": 32
+    },
+    {
+      "epoch": 0.014670981661272923,
+      "grad_norm": 0.48883870244026184,
+      "learning_rate": 5.929919137466308e-06,
+      "loss": 1.737749695777893,
+      "step": 34
+    },
+    {
+      "epoch": 0.015533980582524271,
+      "grad_norm": 0.3311465084552765,
+      "learning_rate": 6.289308176100629e-06,
+      "loss": 1.5578981637954712,
+      "step": 36
+    },
+    {
+      "epoch": 0.01639697950377562,
+      "grad_norm": 0.5178973078727722,
+      "learning_rate": 6.64869721473495e-06,
+      "loss": 1.719806432723999,
+      "step": 38
+    },
+    {
+      "epoch": 0.017259978425026967,
+      "grad_norm": 0.47097742557525635,
+      "learning_rate": 7.008086253369272e-06,
+      "loss": 1.728212833404541,
+      "step": 40
+    },
+    {
+      "epoch": 0.018122977346278317,
+      "grad_norm": 0.5051584243774414,
+      "learning_rate": 7.367475292003594e-06,
+      "loss": 1.6542466878890991,
+      "step": 42
+    },
+    {
+      "epoch": 0.018985976267529667,
+      "grad_norm": 0.4645111560821533,
+      "learning_rate": 7.726864330637915e-06,
+      "loss": 1.7087690830230713,
+      "step": 44
+    },
+    {
+      "epoch": 0.019848975188781013,
+      "grad_norm": 0.5184999704360962,
+      "learning_rate": 8.086253369272237e-06,
+      "loss": 1.7018946409225464,
+      "step": 46
+    },
+    {
+      "epoch": 0.020711974110032363,
+      "grad_norm": 0.4543815851211548,
+      "learning_rate": 8.44564240790656e-06,
+      "loss": 1.6818269491195679,
+      "step": 48
+    },
+    {
+      "epoch": 0.021574973031283712,
+      "grad_norm": 0.44411996006965637,
+      "learning_rate": 8.80503144654088e-06,
+      "loss": 1.5772877931594849,
+      "step": 50
+    },
+    {
+      "epoch": 0.02243797195253506,
+      "grad_norm": 0.3409404158592224,
+      "learning_rate": 9.164420485175203e-06,
+      "loss": 1.498152732849121,
+      "step": 52
+    },
+    {
+      "epoch": 0.02330097087378641,
+      "grad_norm": 0.42104434967041016,
+      "learning_rate": 9.523809523809523e-06,
+      "loss": 1.6189048290252686,
+      "step": 54
+    },
+    {
+      "epoch": 0.024163969795037755,
+      "grad_norm": 0.3756246268749237,
+      "learning_rate": 9.883198562443846e-06,
+      "loss": 1.4596441984176636,
+      "step": 56
+    },
+    {
+      "epoch": 0.025026968716289105,
+      "grad_norm": 0.36214128136634827,
+      "learning_rate": 1.0242587601078168e-05,
+      "loss": 1.503880500793457,
+      "step": 58
+    },
+    {
+      "epoch": 0.025889967637540454,
+      "grad_norm": 0.40893009305000305,
+      "learning_rate": 1.060197663971249e-05,
+      "loss": 1.5912823677062988,
+      "step": 60
+    },
+    {
+      "epoch": 0.0267529665587918,
+      "grad_norm": 0.28710272908210754,
+      "learning_rate": 1.0961365678346811e-05,
+      "loss": 1.2956721782684326,
+      "step": 62
+    },
+    {
+      "epoch": 0.02761596548004315,
+      "grad_norm": 0.304573118686676,
+      "learning_rate": 1.1320754716981132e-05,
+      "loss": 1.4648056030273438,
+      "step": 64
+    },
+    {
+      "epoch": 0.0284789644012945,
+      "grad_norm": 0.36523914337158203,
+      "learning_rate": 1.1680143755615454e-05,
+      "loss": 1.6078968048095703,
+      "step": 66
+    },
+    {
+      "epoch": 0.029341963322545846,
+      "grad_norm": 0.37929031252861023,
+      "learning_rate": 1.2039532794249775e-05,
+      "loss": 1.5969421863555908,
+      "step": 68
+    },
+    {
+      "epoch": 0.030204962243797196,
+      "grad_norm": 0.3053947389125824,
+      "learning_rate": 1.2398921832884097e-05,
+      "loss": 1.4312325716018677,
+      "step": 70
+    },
+    {
+      "epoch": 0.031067961165048542,
+      "grad_norm": 0.3028779923915863,
+      "learning_rate": 1.275831087151842e-05,
+      "loss": 1.4101300239562988,
+      "step": 72
+    },
+    {
+      "epoch": 0.03193096008629989,
+      "grad_norm": 0.29649803042411804,
+      "learning_rate": 1.3117699910152742e-05,
+      "loss": 1.4553817510604858,
+      "step": 74
+    },
+    {
+      "epoch": 0.03279395900755124,
+      "grad_norm": 0.26032644510269165,
+      "learning_rate": 1.3477088948787062e-05,
+      "loss": 1.4623000621795654,
+      "step": 76
+    },
+    {
+      "epoch": 0.03365695792880259,
+      "grad_norm": 0.33558446168899536,
+      "learning_rate": 1.3836477987421385e-05,
+      "loss": 1.5181745290756226,
+      "step": 78
+    },
+    {
+      "epoch": 0.034519956850053934,
+      "grad_norm": 0.28307804465293884,
+      "learning_rate": 1.4195867026055706e-05,
+      "loss": 1.4397861957550049,
+      "step": 80
+    },
+    {
+      "epoch": 0.035382955771305284,
+      "grad_norm": 0.3451690673828125,
+      "learning_rate": 1.455525606469003e-05,
+      "loss": 1.463841199874878,
+      "step": 82
+    },
+    {
+      "epoch": 0.036245954692556634,
+      "grad_norm": 0.3248669505119324,
+      "learning_rate": 1.4914645103324348e-05,
+      "loss": 1.3554227352142334,
+      "step": 84
+    },
+    {
+      "epoch": 0.037108953613807984,
+      "grad_norm": 0.2855011224746704,
+      "learning_rate": 1.527403414195867e-05,
+      "loss": 1.2810425758361816,
+      "step": 86
+    },
+    {
+      "epoch": 0.03797195253505933,
+      "grad_norm": 0.33365535736083984,
+      "learning_rate": 1.5633423180592992e-05,
+      "loss": 1.428163766860962,
+      "step": 88
+    },
+    {
+      "epoch": 0.038834951456310676,
+      "grad_norm": 0.34099438786506653,
+      "learning_rate": 1.5992812219227316e-05,
+      "loss": 1.3487578630447388,
+      "step": 90
+    },
+    {
+      "epoch": 0.039697950377562026,
+      "grad_norm": 0.39247506856918335,
+      "learning_rate": 1.6352201257861635e-05,
+      "loss": 1.30057954788208,
+      "step": 92
+    },
+    {
+      "epoch": 0.040560949298813376,
+      "grad_norm": 0.32692041993141174,
+      "learning_rate": 1.671159029649596e-05,
+      "loss": 1.2923580408096313,
+      "step": 94
+    },
+    {
+      "epoch": 0.041423948220064725,
+      "grad_norm": 0.43452519178390503,
+      "learning_rate": 1.707097933513028e-05,
+      "loss": 1.5002273321151733,
+      "step": 96
+    },
+    {
+      "epoch": 0.042286947141316075,
+      "grad_norm": 0.3251534402370453,
+      "learning_rate": 1.7430368373764602e-05,
+      "loss": 1.330254077911377,
+      "step": 98
+    },
+    {
+      "epoch": 0.043149946062567425,
+      "grad_norm": 0.3198273479938507,
+      "learning_rate": 1.778975741239892e-05,
+      "loss": 1.3054943084716797,
+      "step": 100
+    },
+    {
+      "epoch": 0.043149946062567425,
+      "eval_loss": 1.366738200187683,
+      "eval_runtime": 651.8198,
+      "eval_samples_per_second": 3.16,
+      "eval_steps_per_second": 3.16,
+      "step": 100
+    },
+    {
+      "epoch": 0.04401294498381877,
+      "grad_norm": 0.37364065647125244,
+      "learning_rate": 1.8149146451033245e-05,
+      "loss": 1.314281940460205,
+      "step": 102
+    },
+    {
+      "epoch": 0.04487594390507012,
+      "grad_norm": 0.39384758472442627,
+      "learning_rate": 1.8508535489667568e-05,
+      "loss": 1.2737246751785278,
+      "step": 104
+    },
+    {
+      "epoch": 0.04573894282632147,
+      "grad_norm": 0.3521905541419983,
+      "learning_rate": 1.8867924528301888e-05,
+      "loss": 1.3113226890563965,
+      "step": 106
+    },
+    {
+      "epoch": 0.04660194174757282,
+      "grad_norm": 0.33531463146209717,
+      "learning_rate": 1.9227313566936208e-05,
+      "loss": 1.3253653049468994,
+      "step": 108
+    },
+    {
+      "epoch": 0.04746494066882417,
+      "grad_norm": 0.35596340894699097,
+      "learning_rate": 1.958670260557053e-05,
+      "loss": 1.3236849308013916,
+      "step": 110
+    },
+    {
+      "epoch": 0.04832793959007551,
+      "grad_norm": 0.36028242111206055,
+      "learning_rate": 1.9946091644204854e-05,
+      "loss": 1.183128833770752,
+      "step": 112
+    },
+    {
+      "epoch": 0.04919093851132686,
+      "grad_norm": 0.42109814286231995,
+      "learning_rate": 2.0305480682839174e-05,
+      "loss": 1.2741888761520386,
+      "step": 114
+    },
+    {
+      "epoch": 0.05005393743257821,
+      "grad_norm": 0.39675939083099365,
+      "learning_rate": 2.0664869721473494e-05,
+      "loss": 1.3050109148025513,
+      "step": 116
+    },
+    {
+      "epoch": 0.05091693635382956,
+      "grad_norm": 0.4414141774177551,
+      "learning_rate": 2.1024258760107817e-05,
+      "loss": 1.2472094297409058,
+      "step": 118
+    },
+    {
+      "epoch": 0.05177993527508091,
+      "grad_norm": 0.42872729897499084,
+      "learning_rate": 2.138364779874214e-05,
+      "loss": 1.3338921070098877,
+      "step": 120
+    },
+    {
+      "epoch": 0.05264293419633225,
+      "grad_norm": 0.38336244225502014,
+      "learning_rate": 2.174303683737646e-05,
+      "loss": 1.322908878326416,
+      "step": 122
+    },
+    {
+      "epoch": 0.0535059331175836,
+      "grad_norm": 0.41046878695487976,
+      "learning_rate": 2.2102425876010783e-05,
+      "loss": 1.2169240713119507,
+      "step": 124
+    },
+    {
+      "epoch": 0.05436893203883495,
+      "grad_norm": 0.39460113644599915,
+      "learning_rate": 2.2461814914645103e-05,
+      "loss": 1.2085309028625488,
+      "step": 126
+    },
+    {
+      "epoch": 0.0552319309600863,
+      "grad_norm": 0.42829909920692444,
+      "learning_rate": 2.2821203953279426e-05,
+      "loss": 1.2969133853912354,
+      "step": 128
+    },
+    {
+      "epoch": 0.05609492988133765,
+      "grad_norm": 0.3940851390361786,
+      "learning_rate": 2.3180592991913746e-05,
+      "loss": 1.1892330646514893,
+      "step": 130
+    },
+    {
+      "epoch": 0.056957928802589,
+      "grad_norm": 0.45011839270591736,
+      "learning_rate": 2.353998203054807e-05,
+      "loss": 1.2082979679107666,
+      "step": 132
+    },
+    {
+      "epoch": 0.05782092772384034,
+      "grad_norm": 0.46059420704841614,
+      "learning_rate": 2.3899371069182393e-05,
+      "loss": 1.2388817071914673,
+      "step": 134
+    },
+    {
+      "epoch": 0.05868392664509169,
+      "grad_norm": 0.41085872054100037,
+      "learning_rate": 2.4258760107816713e-05,
+      "loss": 1.193917155265808,
+      "step": 136
+    },
+    {
+      "epoch": 0.05954692556634304,
+      "grad_norm": 0.4024205207824707,
+      "learning_rate": 2.4618149146451032e-05,
+      "loss": 1.1514034271240234,
+      "step": 138
+    },
+    {
+      "epoch": 0.06040992448759439,
+      "grad_norm": 0.3893793523311615,
+      "learning_rate": 2.4977538185085356e-05,
+      "loss": 1.1626157760620117,
+      "step": 140
+    },
+    {
+      "epoch": 0.06127292340884574,
+      "grad_norm": 0.4456317126750946,
+      "learning_rate": 2.5336927223719675e-05,
+      "loss": 1.1627076864242554,
+      "step": 142
+    },
+    {
+      "epoch": 0.062135922330097085,
+      "grad_norm": 0.5050215125083923,
+      "learning_rate": 2.5696316262354e-05,
+      "loss": 1.3038755655288696,
+      "step": 144
+    },
+    {
+      "epoch": 0.06299892125134844,
+      "grad_norm": 0.4071207642555237,
+      "learning_rate": 2.605570530098832e-05,
+      "loss": 1.1708844900131226,
+      "step": 146
+    },
+    {
+      "epoch": 0.06386192017259978,
+      "grad_norm": 0.4363228678703308,
+      "learning_rate": 2.641509433962264e-05,
+      "loss": 1.2149070501327515,
+      "step": 148
+    },
+    {
+      "epoch": 0.06472491909385113,
+      "grad_norm": 0.4436556398868561,
+      "learning_rate": 2.6774483378256965e-05,
+      "loss": 1.1942368745803833,
+      "step": 150
+    },
+    {
+      "epoch": 0.06558791801510248,
+      "grad_norm": 0.4068629741668701,
+      "learning_rate": 2.7133872416891288e-05,
+      "loss": 1.1799161434173584,
+      "step": 152
+    },
+    {
+      "epoch": 0.06645091693635383,
+      "grad_norm": 0.5291106700897217,
+      "learning_rate": 2.7493261455525608e-05,
+      "loss": 1.1832845211029053,
+      "step": 154
+    },
+    {
+      "epoch": 0.06731391585760518,
+      "grad_norm": 0.4410109221935272,
+      "learning_rate": 2.785265049415993e-05,
+      "loss": 1.1696993112564087,
+      "step": 156
+    },
+    {
+      "epoch": 0.06817691477885653,
+      "grad_norm": 0.4858371913433075,
+      "learning_rate": 2.8212039532794248e-05,
+      "loss": 1.2036973237991333,
+      "step": 158
+    },
+    {
+      "epoch": 0.06903991370010787,
+      "grad_norm": 0.45373693108558655,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.1145079135894775,
+      "step": 160
+    },
+    {
+      "epoch": 0.06990291262135923,
+      "grad_norm": 0.4881038963794708,
+      "learning_rate": 2.8930817610062894e-05,
+      "loss": 1.173502802848816,
+      "step": 162
+    },
+    {
+      "epoch": 0.07076591154261057,
+      "grad_norm": 0.576934814453125,
+      "learning_rate": 2.9290206648697217e-05,
+      "loss": 1.250414490699768,
+      "step": 164
+    },
+    {
+      "epoch": 0.07162891046386193,
+      "grad_norm": 0.4900001287460327,
+      "learning_rate": 2.9649595687331537e-05,
+      "loss": 1.0721495151519775,
+      "step": 166
+    },
+    {
+      "epoch": 0.07249190938511327,
+      "grad_norm": 0.4440019726753235,
+      "learning_rate": 3.000898472596586e-05,
+      "loss": 1.0689374208450317,
+      "step": 168
+    },
+    {
+      "epoch": 0.07335490830636461,
+      "grad_norm": 0.4267268180847168,
+      "learning_rate": 3.0368373764600184e-05,
+      "loss": 1.2095128297805786,
+      "step": 170
+    },
+    {
+      "epoch": 0.07421790722761597,
+      "grad_norm": 0.6062787771224976,
+      "learning_rate": 3.0727762803234503e-05,
+      "loss": 1.077776551246643,
+      "step": 172
+    },
+    {
+      "epoch": 0.07508090614886731,
+      "grad_norm": 0.49510180950164795,
+      "learning_rate": 3.108715184186882e-05,
+      "loss": 1.144006371498108,
+      "step": 174
+    },
+    {
+      "epoch": 0.07594390507011867,
+      "grad_norm": 0.4670701026916504,
+      "learning_rate": 3.144654088050314e-05,
+      "loss": 1.1663392782211304,
+      "step": 176
+    },
+    {
+      "epoch": 0.07680690399137001,
+      "grad_norm": 0.5615383386611938,
+      "learning_rate": 3.1805929919137466e-05,
+      "loss": 1.1665973663330078,
+      "step": 178
+    },
+    {
+      "epoch": 0.07766990291262135,
+      "grad_norm": 0.47305551171302795,
+      "learning_rate": 3.216531895777179e-05,
+      "loss": 1.1337063312530518,
+      "step": 180
+    },
+    {
+      "epoch": 0.07853290183387271,
+      "grad_norm": 0.5127068758010864,
+      "learning_rate": 3.252470799640611e-05,
+      "loss": 1.072874903678894,
+      "step": 182
+    },
+    {
+      "epoch": 0.07939590075512405,
+      "grad_norm": 0.632448136806488,
+      "learning_rate": 3.2884097035040436e-05,
+      "loss": 1.1577240228652954,
+      "step": 184
+    },
+    {
+      "epoch": 0.08025889967637541,
+      "grad_norm": 0.4041025638580322,
+      "learning_rate": 3.324348607367476e-05,
+      "loss": 1.1186822652816772,
+      "step": 186
+    },
+    {
+      "epoch": 0.08112189859762675,
+      "grad_norm": 0.5239102244377136,
+      "learning_rate": 3.3602875112309076e-05,
+      "loss": 1.1468429565429688,
+      "step": 188
+    },
+    {
+      "epoch": 0.08198489751887811,
+      "grad_norm": 0.4486575424671173,
+      "learning_rate": 3.39622641509434e-05,
+      "loss": 1.0017019510269165,
+      "step": 190
+    },
+    {
+      "epoch": 0.08284789644012945,
+      "grad_norm": 0.4994317293167114,
+      "learning_rate": 3.4321653189577715e-05,
+      "loss": 1.1901532411575317,
+      "step": 192
+    },
+    {
+      "epoch": 0.0837108953613808,
+      "grad_norm": 0.5023699998855591,
+      "learning_rate": 3.468104222821204e-05,
+      "loss": 1.1398564577102661,
+      "step": 194
+    },
+    {
+      "epoch": 0.08457389428263215,
+      "grad_norm": 0.5077701807022095,
+      "learning_rate": 3.504043126684636e-05,
+      "loss": 1.1390413045883179,
+      "step": 196
+    },
+    {
+      "epoch": 0.0854368932038835,
+      "grad_norm": 0.5527892112731934,
+      "learning_rate": 3.5399820305480685e-05,
+      "loss": 1.1411432027816772,
+      "step": 198
+    },
+    {
+      "epoch": 0.08629989212513485,
+      "grad_norm": 0.5572488903999329,
+      "learning_rate": 3.575920934411501e-05,
+      "loss": 1.071260690689087,
+      "step": 200
+    },
+    {
+      "epoch": 0.08629989212513485,
+      "eval_loss": 1.1519012451171875,
+      "eval_runtime": 654.6055,
+      "eval_samples_per_second": 3.147,
+      "eval_steps_per_second": 3.147,
+      "step": 200
+    },
+    {
+      "epoch": 0.08716289104638619,
+      "grad_norm": 0.5134095549583435,
+      "learning_rate": 3.611859838274933e-05,
+      "loss": 1.138135552406311,
+      "step": 202
+    },
+    {
+      "epoch": 0.08802588996763754,
+      "grad_norm": 0.5166040658950806,
+      "learning_rate": 3.647798742138365e-05,
+      "loss": 1.111999273300171,
+      "step": 204
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.5336993336677551,
+      "learning_rate": 3.683737646001797e-05,
+      "loss": 1.1031352281570435,
+      "step": 206
+    },
+    {
+      "epoch": 0.08975188781014024,
+      "grad_norm": 0.8289600014686584,
+      "learning_rate": 3.7196765498652294e-05,
+      "loss": 1.0388667583465576,
+      "step": 208
+    },
+    {
+      "epoch": 0.09061488673139159,
+      "grad_norm": 0.47992637753486633,
+      "learning_rate": 3.755615453728661e-05,
+      "loss": 1.0950241088867188,
+      "step": 210
+    },
+    {
+      "epoch": 0.09147788565264293,
+      "grad_norm": 0.5629691481590271,
+      "learning_rate": 3.7915543575920934e-05,
+      "loss": 1.0361733436584473,
+      "step": 212
+    },
+    {
+      "epoch": 0.09234088457389428,
+      "grad_norm": 0.5515111684799194,
+      "learning_rate": 3.827493261455526e-05,
+      "loss": 1.0922447443008423,
+      "step": 214
+    },
+    {
+      "epoch": 0.09320388349514563,
+      "grad_norm": 0.5078643560409546,
+      "learning_rate": 3.863432165318958e-05,
+      "loss": 1.0866856575012207,
+      "step": 216
+    },
+    {
+      "epoch": 0.09406688241639698,
+      "grad_norm": 0.6046127676963806,
+      "learning_rate": 3.8993710691823904e-05,
+      "loss": 1.1231595277786255,
+      "step": 218
+    },
+    {
+      "epoch": 0.09492988133764833,
+      "grad_norm": 0.6255762577056885,
+      "learning_rate": 3.935309973045822e-05,
+      "loss": 1.099171757698059,
+      "step": 220
+    },
+    {
+      "epoch": 0.09579288025889968,
+      "grad_norm": 0.6036638021469116,
+      "learning_rate": 3.971248876909254e-05,
+      "loss": 1.0557761192321777,
+      "step": 222
+    },
+    {
+      "epoch": 0.09665587918015102,
+      "grad_norm": 0.5520529747009277,
+      "learning_rate": 4.0071877807726867e-05,
+      "loss": 1.0467877388000488,
+      "step": 224
+    },
+    {
+      "epoch": 0.09751887810140238,
+      "grad_norm": 0.5958684682846069,
+      "learning_rate": 4.043126684636119e-05,
+      "loss": 1.17941153049469,
+      "step": 226
+    },
+    {
+      "epoch": 0.09838187702265372,
+      "grad_norm": 0.5283281803131104,
+      "learning_rate": 4.079065588499551e-05,
+      "loss": 1.104217767715454,
+      "step": 228
+    },
+    {
+      "epoch": 0.09924487594390508,
+      "grad_norm": 0.5608792901039124,
+      "learning_rate": 4.115004492362983e-05,
+      "loss": 1.0900640487670898,
+      "step": 230
+    },
+    {
+      "epoch": 0.10010787486515642,
+      "grad_norm": 0.555964469909668,
+      "learning_rate": 4.150943396226415e-05,
+      "loss": 0.9887422323226929,
+      "step": 232
+    },
+    {
+      "epoch": 0.10097087378640776,
+      "grad_norm": 0.5875785946846008,
+      "learning_rate": 4.1868823000898476e-05,
+      "loss": 1.1298567056655884,
+      "step": 234
+    },
+    {
+      "epoch": 0.10183387270765912,
+      "grad_norm": 0.4544795751571655,
+      "learning_rate": 4.222821203953279e-05,
+      "loss": 1.0957067012786865,
+      "step": 236
+    },
+    {
+      "epoch": 0.10269687162891046,
+      "grad_norm": 0.564145565032959,
+      "learning_rate": 4.2587601078167116e-05,
+      "loss": 1.0328738689422607,
+      "step": 238
+    },
+    {
+      "epoch": 0.10355987055016182,
+      "grad_norm": 0.6285979747772217,
+      "learning_rate": 4.294699011680144e-05,
+      "loss": 1.1085515022277832,
+      "step": 240
+    },
+    {
+      "epoch": 0.10442286947141316,
+      "grad_norm": 0.6442288756370544,
+      "learning_rate": 4.330637915543576e-05,
+      "loss": 1.1291271448135376,
+      "step": 242
+    },
+    {
+      "epoch": 0.1052858683926645,
+      "grad_norm": 0.6137154698371887,
+      "learning_rate": 4.3665768194070085e-05,
+      "loss": 1.1759567260742188,
+      "step": 244
+    },
+    {
+      "epoch": 0.10614886731391586,
+      "grad_norm": 0.5906805992126465,
+      "learning_rate": 4.402515723270441e-05,
+      "loss": 1.148414969444275,
+      "step": 246
+    },
+    {
+      "epoch": 0.1070118662351672,
+      "grad_norm": 0.5382888913154602,
+      "learning_rate": 4.438454627133873e-05,
+      "loss": 1.0749616622924805,
+      "step": 248
+    },
+    {
+      "epoch": 0.10787486515641856,
+      "grad_norm": 0.6185492873191833,
+      "learning_rate": 4.474393530997305e-05,
+      "loss": 1.2235801219940186,
+      "step": 250
+    },
+    {
+      "epoch": 0.1087378640776699,
+      "grad_norm": 0.5981597900390625,
+      "learning_rate": 4.5103324348607365e-05,
+      "loss": 1.1390639543533325,
+      "step": 252
+    },
+    {
+      "epoch": 0.10960086299892124,
+      "grad_norm": 0.5664694905281067,
+      "learning_rate": 4.546271338724169e-05,
+      "loss": 1.171774983406067,
+      "step": 254
+    },
+    {
+      "epoch": 0.1104638619201726,
+      "grad_norm": 0.7071851491928101,
+      "learning_rate": 4.582210242587601e-05,
+      "loss": 1.1704237461090088,
+      "step": 256
+    },
+    {
+      "epoch": 0.11132686084142394,
+      "grad_norm": 0.5815614461898804,
+      "learning_rate": 4.6181491464510334e-05,
+      "loss": 1.0619677305221558,
+      "step": 258
+    },
+    {
+      "epoch": 0.1121898597626753,
+      "grad_norm": 0.6481915712356567,
+      "learning_rate": 4.654088050314466e-05,
+      "loss": 1.0824390649795532,
+      "step": 260
+    },
+    {
+      "epoch": 0.11305285868392664,
+      "grad_norm": 0.5988591313362122,
+      "learning_rate": 4.690026954177898e-05,
+      "loss": 1.087929606437683,
+      "step": 262
+    },
+    {
+      "epoch": 0.113915857605178,
+      "grad_norm": 0.6545296311378479,
+      "learning_rate": 4.7259658580413304e-05,
+      "loss": 1.0936195850372314,
+      "step": 264
+    },
+    {
+      "epoch": 0.11477885652642934,
+      "grad_norm": 0.5826204419136047,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 1.0433681011199951,
+      "step": 266
+    },
+    {
+      "epoch": 0.11564185544768069,
+      "grad_norm": 0.5907514095306396,
+      "learning_rate": 4.7978436657681944e-05,
+      "loss": 1.0719536542892456,
+      "step": 268
+    },
+    {
+      "epoch": 0.11650485436893204,
+      "grad_norm": 0.524394154548645,
+      "learning_rate": 4.833782569631627e-05,
+      "loss": 1.0231504440307617,
+      "step": 270
+    },
+    {
+      "epoch": 0.11736785329018339,
+      "grad_norm": 0.5472846031188965,
+      "learning_rate": 4.869721473495058e-05,
+      "loss": 0.9905915260314941,
+      "step": 272
+    },
+    {
+      "epoch": 0.11823085221143474,
+      "grad_norm": 0.727922260761261,
+      "learning_rate": 4.9056603773584906e-05,
+      "loss": 1.213677167892456,
+      "step": 274
+    },
+    {
+      "epoch": 0.11909385113268608,
+      "grad_norm": 0.6009684801101685,
+      "learning_rate": 4.941599281221923e-05,
+      "loss": 1.0052144527435303,
+      "step": 276
+    },
+    {
+      "epoch": 0.11995685005393743,
+      "grad_norm": 0.6564669013023376,
+      "learning_rate": 4.977538185085355e-05,
+      "loss": 1.108136773109436,
+      "step": 278
+    },
+    {
+      "epoch": 0.12081984897518878,
+      "grad_norm": 0.650074303150177,
+      "learning_rate": 5.013477088948787e-05,
+      "loss": 0.9700815677642822,
+      "step": 280
+    },
+    {
+      "epoch": 0.12168284789644013,
+      "grad_norm": 0.5772947072982788,
+      "learning_rate": 5.04941599281222e-05,
+      "loss": 1.038031816482544,
+      "step": 282
+    },
+    {
+      "epoch": 0.12254584681769148,
+      "grad_norm": 0.7293002009391785,
+      "learning_rate": 5.0853548966756516e-05,
+      "loss": 1.1063730716705322,
+      "step": 284
+    },
+    {
+      "epoch": 0.12340884573894283,
+      "grad_norm": 0.7937333583831787,
+      "learning_rate": 5.1212938005390846e-05,
+      "loss": 1.128495693206787,
+      "step": 286
+    },
+    {
+      "epoch": 0.12427184466019417,
+      "grad_norm": 0.48499324917793274,
+      "learning_rate": 5.157232704402516e-05,
+      "loss": 0.9438712000846863,
+      "step": 288
+    },
+    {
+      "epoch": 0.12513484358144553,
+      "grad_norm": 0.6010656952857971,
+      "learning_rate": 5.193171608265948e-05,
+      "loss": 1.0872881412506104,
+      "step": 290
+    },
+    {
+      "epoch": 0.12599784250269688,
+      "grad_norm": 0.6240811944007874,
+      "learning_rate": 5.22911051212938e-05,
+      "loss": 1.110992193222046,
+      "step": 292
+    },
+    {
+      "epoch": 0.1268608414239482,
+      "grad_norm": 0.7172768712043762,
+      "learning_rate": 5.265049415992812e-05,
+      "loss": 1.1109752655029297,
+      "step": 294
+    },
+    {
+      "epoch": 0.12772384034519957,
+      "grad_norm": 0.6442400217056274,
+      "learning_rate": 5.300988319856245e-05,
+      "loss": 1.05553138256073,
+      "step": 296
+    },
+    {
+      "epoch": 0.12858683926645093,
+      "grad_norm": 0.7074702382087708,
+      "learning_rate": 5.3369272237196765e-05,
+      "loss": 1.0717648267745972,
+      "step": 298
+    },
+    {
+      "epoch": 0.12944983818770225,
+      "grad_norm": 0.5277591347694397,
+      "learning_rate": 5.3728661275831095e-05,
+      "loss": 0.9777541756629944,
+      "step": 300
+    },
+    {
+      "epoch": 0.12944983818770225,
+      "eval_loss": 1.0977506637573242,
+      "eval_runtime": 662.1728,
+      "eval_samples_per_second": 3.111,
+      "eval_steps_per_second": 3.111,
+      "step": 300
+    },
+    {
+      "epoch": 0.1303128371089536,
+      "grad_norm": 0.7252246737480164,
+      "learning_rate": 5.408805031446541e-05,
+      "loss": 1.075905203819275,
+      "step": 302
+    },
+    {
+      "epoch": 0.13117583603020497,
+      "grad_norm": 0.7003294229507446,
+      "learning_rate": 5.444743935309974e-05,
+      "loss": 1.1117515563964844,
+      "step": 304
+    },
+    {
+      "epoch": 0.13203883495145632,
+      "grad_norm": 0.5878211259841919,
+      "learning_rate": 5.480682839173406e-05,
+      "loss": 1.0289191007614136,
+      "step": 306
+    },
+    {
+      "epoch": 0.13290183387270765,
+      "grad_norm": 0.7133644223213196,
+      "learning_rate": 5.5166217430368374e-05,
+      "loss": 1.0199183225631714,
+      "step": 308
+    },
+    {
+      "epoch": 0.133764832793959,
+      "grad_norm": 0.6098423600196838,
+      "learning_rate": 5.55256064690027e-05,
+      "loss": 1.0132375955581665,
+      "step": 310
+    },
+    {
+      "epoch": 0.13462783171521037,
+      "grad_norm": 0.6386916041374207,
+      "learning_rate": 5.5884995507637014e-05,
+      "loss": 1.1595754623413086,
+      "step": 312
+    },
+    {
+      "epoch": 0.1354908306364617,
+      "grad_norm": 0.6563469767570496,
+      "learning_rate": 5.6244384546271344e-05,
+      "loss": 1.0921307802200317,
+      "step": 314
+    },
+    {
+      "epoch": 0.13635382955771305,
+      "grad_norm": 0.6388015747070312,
+      "learning_rate": 5.660377358490566e-05,
+      "loss": 1.0200815200805664,
+      "step": 316
+    },
+    {
+      "epoch": 0.1372168284789644,
+      "grad_norm": 0.6026274561882019,
+      "learning_rate": 5.696316262353999e-05,
+      "loss": 0.9339485764503479,
+      "step": 318
+    },
+    {
+      "epoch": 0.13807982740021574,
+      "grad_norm": 0.619800865650177,
+      "learning_rate": 5.732255166217431e-05,
+      "loss": 1.0268478393554688,
+      "step": 320
+    },
+    {
+      "epoch": 0.1389428263214671,
+      "grad_norm": 0.5924715399742126,
+      "learning_rate": 5.768194070080862e-05,
+      "loss": 1.1394236087799072,
+      "step": 322
+    },
+    {
+      "epoch": 0.13980582524271845,
+      "grad_norm": 0.6829012036323547,
+      "learning_rate": 5.804132973944295e-05,
+      "loss": 1.002437949180603,
+      "step": 324
+    },
+    {
+      "epoch": 0.1406688241639698,
+      "grad_norm": 0.7012544274330139,
+      "learning_rate": 5.840071877807727e-05,
+      "loss": 1.132503628730774,
+      "step": 326
+    },
+    {
+      "epoch": 0.14153182308522114,
+      "grad_norm": 0.7921599745750427,
+      "learning_rate": 5.876010781671159e-05,
+      "loss": 1.1859129667282104,
+      "step": 328
+    },
+    {
+      "epoch": 0.1423948220064725,
+      "grad_norm": 0.6373353004455566,
+      "learning_rate": 5.9119496855345916e-05,
+      "loss": 1.0896776914596558,
+      "step": 330
+    },
+    {
+      "epoch": 0.14325782092772385,
+      "grad_norm": 0.6174030900001526,
+      "learning_rate": 5.947888589398024e-05,
+      "loss": 1.0691723823547363,
+      "step": 332
+    },
+    {
+      "epoch": 0.14412081984897518,
+      "grad_norm": 0.5110617280006409,
+      "learning_rate": 5.9838274932614556e-05,
+      "loss": 1.0144777297973633,
+      "step": 334
+    },
+    {
+      "epoch": 0.14498381877022654,
+      "grad_norm": 0.5580511093139648,
+      "learning_rate": 6.019766397124887e-05,
+      "loss": 0.9955101609230042,
+      "step": 336
+    },
+    {
+      "epoch": 0.1458468176914779,
+      "grad_norm": 0.6427345275878906,
+      "learning_rate": 6.05570530098832e-05,
+      "loss": 0.9863013625144958,
+      "step": 338
+    },
+    {
+      "epoch": 0.14670981661272922,
+      "grad_norm": 0.7464537024497986,
+      "learning_rate": 6.091644204851752e-05,
+      "loss": 1.0682255029678345,
+      "step": 340
+    },
+    {
+      "epoch": 0.14757281553398058,
+      "grad_norm": 0.599926769733429,
+      "learning_rate": 6.127583108715184e-05,
+      "loss": 1.034083366394043,
+      "step": 342
+    },
+    {
+      "epoch": 0.14843581445523193,
+      "grad_norm": 0.6320257186889648,
+      "learning_rate": 6.163522012578616e-05,
+      "loss": 1.0776089429855347,
+      "step": 344
+    },
+    {
+      "epoch": 0.1492988133764833,
+      "grad_norm": 0.6565091013908386,
+      "learning_rate": 6.199460916442049e-05,
+      "loss": 1.0493087768554688,
+      "step": 346
+    },
+    {
+      "epoch": 0.15016181229773462,
+      "grad_norm": 0.6512171626091003,
+      "learning_rate": 6.23539982030548e-05,
+      "loss": 1.0469218492507935,
+      "step": 348
+    },
+    {
+      "epoch": 0.15102481121898598,
+      "grad_norm": 0.8487282991409302,
+      "learning_rate": 6.271338724168913e-05,
+      "loss": 1.0985081195831299,
+      "step": 350
+    },
+    {
+      "epoch": 0.15188781014023733,
+      "grad_norm": 0.6718961596488953,
+      "learning_rate": 6.307277628032345e-05,
+      "loss": 1.0714176893234253,
+      "step": 352
+    },
+    {
+      "epoch": 0.15275080906148866,
+      "grad_norm": 0.8175088167190552,
+      "learning_rate": 6.343216531895777e-05,
+      "loss": 1.0599322319030762,
+      "step": 354
+    },
+    {
+      "epoch": 0.15361380798274002,
+      "grad_norm": 0.6359215378761292,
+      "learning_rate": 6.37915543575921e-05,
+      "loss": 0.9268131256103516,
+      "step": 356
+    },
+    {
+      "epoch": 0.15447680690399138,
+      "grad_norm": 0.6423866748809814,
+      "learning_rate": 6.415094339622641e-05,
+      "loss": 0.9838354587554932,
+      "step": 358
+    },
+    {
+      "epoch": 0.1553398058252427,
+      "grad_norm": 0.6496716737747192,
+      "learning_rate": 6.451033243486074e-05,
+      "loss": 1.048566460609436,
+      "step": 360
+    },
+    {
+      "epoch": 0.15620280474649406,
+      "grad_norm": 0.6536920666694641,
+      "learning_rate": 6.486972147349506e-05,
+      "loss": 1.0910537242889404,
+      "step": 362
+    },
+    {
+      "epoch": 0.15706580366774542,
+      "grad_norm": 0.5832068920135498,
+      "learning_rate": 6.522911051212939e-05,
+      "loss": 0.9971448183059692,
+      "step": 364
+    },
+    {
+      "epoch": 0.15792880258899678,
+      "grad_norm": 0.6647719144821167,
+      "learning_rate": 6.558849955076371e-05,
+      "loss": 1.0496708154678345,
+      "step": 366
+    },
+    {
+      "epoch": 0.1587918015102481,
+      "grad_norm": 0.623252809047699,
+      "learning_rate": 6.594788858939802e-05,
+      "loss": 0.955894410610199,
+      "step": 368
+    },
+    {
+      "epoch": 0.15965480043149946,
+      "grad_norm": 0.6311860084533691,
+      "learning_rate": 6.630727762803235e-05,
+      "loss": 1.1304032802581787,
+      "step": 370
+    },
+    {
+      "epoch": 0.16051779935275082,
+      "grad_norm": 0.5306481122970581,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8746405243873596,
+      "step": 372
+    },
+    {
+      "epoch": 0.16138079827400215,
+      "grad_norm": 0.6249631643295288,
+      "learning_rate": 6.7026055705301e-05,
+      "loss": 0.9104986786842346,
+      "step": 374
+    },
+    {
+      "epoch": 0.1622437971952535,
+      "grad_norm": 0.6243219971656799,
+      "learning_rate": 6.738544474393532e-05,
+      "loss": 1.043666124343872,
+      "step": 376
+    },
+    {
+      "epoch": 0.16310679611650486,
+      "grad_norm": 0.6833282113075256,
+      "learning_rate": 6.774483378256963e-05,
+      "loss": 1.0504906177520752,
+      "step": 378
+    },
+    {
+      "epoch": 0.16396979503775622,
+      "grad_norm": 0.7124452590942383,
+      "learning_rate": 6.810422282120395e-05,
+      "loss": 1.0608166456222534,
+      "step": 380
+    },
+    {
+      "epoch": 0.16483279395900755,
+      "grad_norm": 0.7520908117294312,
+      "learning_rate": 6.846361185983828e-05,
+      "loss": 1.1653732061386108,
+      "step": 382
+    },
+    {
+      "epoch": 0.1656957928802589,
+      "grad_norm": 0.7121814489364624,
+      "learning_rate": 6.88230008984726e-05,
+      "loss": 1.0626367330551147,
+      "step": 384
+    },
+    {
+      "epoch": 0.16655879180151026,
+      "grad_norm": 0.6825008988380432,
+      "learning_rate": 6.918238993710691e-05,
+      "loss": 1.012121319770813,
+      "step": 386
+    },
+    {
+      "epoch": 0.1674217907227616,
+      "grad_norm": 0.4922940135002136,
+      "learning_rate": 6.954177897574124e-05,
+      "loss": 1.0576211214065552,
+      "step": 388
+    },
+    {
+      "epoch": 0.16828478964401294,
+      "grad_norm": 0.6122089624404907,
+      "learning_rate": 6.990116801437556e-05,
+      "loss": 1.03916597366333,
+      "step": 390
+    },
+    {
+      "epoch": 0.1691477885652643,
+      "grad_norm": 0.6348981261253357,
+      "learning_rate": 7.026055705300989e-05,
+      "loss": 1.17647123336792,
+      "step": 392
+    },
+    {
+      "epoch": 0.17001078748651563,
+      "grad_norm": 0.6205878257751465,
+      "learning_rate": 7.06199460916442e-05,
+      "loss": 0.9095983505249023,
+      "step": 394
+    },
+    {
+      "epoch": 0.170873786407767,
+      "grad_norm": 0.61506187915802,
+      "learning_rate": 7.097933513027853e-05,
+      "loss": 1.082506775856018,
+      "step": 396
+    },
+    {
+      "epoch": 0.17173678532901834,
+      "grad_norm": 0.6481751799583435,
+      "learning_rate": 7.133872416891285e-05,
+      "loss": 1.0716280937194824,
+      "step": 398
+    },
+    {
+      "epoch": 0.1725997842502697,
+      "grad_norm": 0.4871014952659607,
+      "learning_rate": 7.169811320754717e-05,
+      "loss": 0.9616814851760864,
+      "step": 400
+    },
+    {
+      "epoch": 0.1725997842502697,
+      "eval_loss": 1.0649415254592896,
+      "eval_runtime": 668.6025,
+      "eval_samples_per_second": 3.081,
+      "eval_steps_per_second": 3.081,
+      "step": 400
+    },
+    {
+      "epoch": 0.17346278317152103,
+      "grad_norm": 0.5680040121078491,
+      "learning_rate": 7.20575022461815e-05,
+      "loss": 1.0475050210952759,
+      "step": 402
+    },
+    {
+      "epoch": 0.17432578209277239,
+      "grad_norm": 0.6417813897132874,
+      "learning_rate": 7.241689128481581e-05,
+      "loss": 0.9851161241531372,
+      "step": 404
+    },
+    {
+      "epoch": 0.17518878101402374,
+      "grad_norm": 0.6600468158721924,
+      "learning_rate": 7.277628032345014e-05,
+      "loss": 1.013339638710022,
+      "step": 406
+    },
+    {
+      "epoch": 0.17605177993527507,
+      "grad_norm": 0.6733932495117188,
+      "learning_rate": 7.313566936208446e-05,
+      "loss": 0.9346804022789001,
+      "step": 408
+    },
+    {
+      "epoch": 0.17691477885652643,
+      "grad_norm": 0.6812151074409485,
+      "learning_rate": 7.349505840071879e-05,
+      "loss": 0.9890368580818176,
+      "step": 410
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.6380394697189331,
+      "learning_rate": 7.385444743935311e-05,
+      "loss": 0.8787848949432373,
+      "step": 412
+    },
+    {
+      "epoch": 0.1786407766990291,
+      "grad_norm": 0.6004905700683594,
+      "learning_rate": 7.421383647798742e-05,
+      "loss": 1.0235728025436401,
+      "step": 414
+    },
+    {
+      "epoch": 0.17950377562028047,
+      "grad_norm": 0.6569193005561829,
+      "learning_rate": 7.457322551662175e-05,
+      "loss": 0.9972385168075562,
+      "step": 416
+    },
+    {
+      "epoch": 0.18036677454153183,
+      "grad_norm": 0.6761631369590759,
+      "learning_rate": 7.493261455525607e-05,
+      "loss": 0.9593698382377625,
+      "step": 418
+    },
+    {
+      "epoch": 0.18122977346278318,
+      "grad_norm": 0.7328561544418335,
+      "learning_rate": 7.529200359389039e-05,
+      "loss": 1.0426853895187378,
+      "step": 420
+    },
+    {
+      "epoch": 0.1820927723840345,
+      "grad_norm": 0.6256070137023926,
+      "learning_rate": 7.56513926325247e-05,
+      "loss": 0.9608182311058044,
+      "step": 422
+    },
+    {
+      "epoch": 0.18295577130528587,
+      "grad_norm": 1.2549844980239868,
+      "learning_rate": 7.601078167115903e-05,
+      "loss": 1.0162668228149414,
+      "step": 424
+    },
+    {
+      "epoch": 0.18381877022653723,
+      "grad_norm": 0.6751510500907898,
+      "learning_rate": 7.637017070979335e-05,
+      "loss": 1.130725383758545,
+      "step": 426
+    },
+    {
+      "epoch": 0.18468176914778855,
+      "grad_norm": 0.7029808163642883,
+      "learning_rate": 7.672955974842768e-05,
+      "loss": 1.0384817123413086,
+      "step": 428
+    },
+    {
+      "epoch": 0.1855447680690399,
+      "grad_norm": 0.644353449344635,
+      "learning_rate": 7.7088948787062e-05,
+      "loss": 1.017020344734192,
+      "step": 430
+    },
+    {
+      "epoch": 0.18640776699029127,
+      "grad_norm": 0.6784916520118713,
+      "learning_rate": 7.744833782569631e-05,
+      "loss": 1.005354404449463,
+      "step": 432
+    },
+    {
+      "epoch": 0.1872707659115426,
+      "grad_norm": 0.5989449620246887,
+      "learning_rate": 7.780772686433064e-05,
+      "loss": 1.026848316192627,
+      "step": 434
+    },
+    {
+      "epoch": 0.18813376483279395,
+      "grad_norm": 0.6502639651298523,
+      "learning_rate": 7.816711590296496e-05,
+      "loss": 0.9891080856323242,
+      "step": 436
+    },
+    {
+      "epoch": 0.1889967637540453,
+      "grad_norm": 0.6176205277442932,
+      "learning_rate": 7.852650494159929e-05,
+      "loss": 0.966316819190979,
+      "step": 438
+    },
+    {
+      "epoch": 0.18985976267529667,
+      "grad_norm": 0.6801626086235046,
+      "learning_rate": 7.88858939802336e-05,
+      "loss": 1.123063087463379,
+      "step": 440
+    },
+    {
+      "epoch": 0.190722761596548,
+      "grad_norm": 0.6718618273735046,
+      "learning_rate": 7.924528301886794e-05,
+      "loss": 1.0467073917388916,
+      "step": 442
+    },
+    {
+      "epoch": 0.19158576051779935,
+      "grad_norm": 0.6761009097099304,
+      "learning_rate": 7.960467205750225e-05,
+      "loss": 1.0952889919281006,
+      "step": 444
+    },
+    {
+      "epoch": 0.1924487594390507,
+      "grad_norm": 0.6356327533721924,
+      "learning_rate": 7.996406109613657e-05,
+      "loss": 0.954807698726654,
+      "step": 446
+    },
+    {
+      "epoch": 0.19331175836030204,
+      "grad_norm": 0.6798669695854187,
+      "learning_rate": 8.03234501347709e-05,
+      "loss": 0.9941422343254089,
+      "step": 448
+    },
+    {
+      "epoch": 0.1941747572815534,
+      "grad_norm": 0.6511302590370178,
+      "learning_rate": 8.068283917340521e-05,
+      "loss": 1.0351495742797852,
+      "step": 450
+    },
+    {
+      "epoch": 0.19503775620280475,
+      "grad_norm": 0.6061258912086487,
+      "learning_rate": 8.104222821203954e-05,
+      "loss": 1.00546133518219,
+      "step": 452
+    },
+    {
+      "epoch": 0.1959007551240561,
+      "grad_norm": 0.6278533935546875,
+      "learning_rate": 8.140161725067386e-05,
+      "loss": 1.0778460502624512,
+      "step": 454
+    },
+    {
+      "epoch": 0.19676375404530744,
+      "grad_norm": 0.6866298317909241,
+      "learning_rate": 8.176100628930818e-05,
+      "loss": 1.0344486236572266,
+      "step": 456
+    },
+    {
+      "epoch": 0.1976267529665588,
+      "grad_norm": 0.7338075041770935,
+      "learning_rate": 8.212039532794251e-05,
+      "loss": 1.0663033723831177,
+      "step": 458
+    },
+    {
+      "epoch": 0.19848975188781015,
+      "grad_norm": 0.6811459064483643,
+      "learning_rate": 8.247978436657682e-05,
+      "loss": 0.9665339589118958,
+      "step": 460
+    },
+    {
+      "epoch": 0.19935275080906148,
+      "grad_norm": 0.6779627799987793,
+      "learning_rate": 8.283917340521114e-05,
+      "loss": 1.024712324142456,
+      "step": 462
+    },
+    {
+      "epoch": 0.20021574973031284,
+      "grad_norm": 0.6486892700195312,
+      "learning_rate": 8.319856244384546e-05,
+      "loss": 0.9699305295944214,
+      "step": 464
+    },
+    {
+      "epoch": 0.2010787486515642,
+      "grad_norm": 0.7022278308868408,
+      "learning_rate": 8.355795148247979e-05,
+      "loss": 0.9540432095527649,
+      "step": 466
+    },
+    {
+      "epoch": 0.20194174757281552,
+      "grad_norm": 0.5922990441322327,
+      "learning_rate": 8.39173405211141e-05,
+      "loss": 0.9253339767456055,
+      "step": 468
+    },
+    {
+      "epoch": 0.20280474649406688,
+      "grad_norm": 0.7076792120933533,
+      "learning_rate": 8.427672955974843e-05,
+      "loss": 0.9987741112709045,
+      "step": 470
+    },
+    {
+      "epoch": 0.20366774541531824,
+      "grad_norm": 0.6491380333900452,
+      "learning_rate": 8.463611859838275e-05,
+      "loss": 1.0249329805374146,
+      "step": 472
+    },
+    {
+      "epoch": 0.2045307443365696,
+      "grad_norm": 0.6784211993217468,
+      "learning_rate": 8.499550763701708e-05,
+      "loss": 1.0577133893966675,
+      "step": 474
+    },
+    {
+      "epoch": 0.20539374325782092,
+      "grad_norm": 0.6453303694725037,
+      "learning_rate": 8.53548966756514e-05,
+      "loss": 1.1312458515167236,
+      "step": 476
+    },
+    {
+      "epoch": 0.20625674217907228,
+      "grad_norm": 0.7431377172470093,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.0592451095581055,
+      "step": 478
+    },
+    {
+      "epoch": 0.20711974110032363,
+      "grad_norm": 0.6097649931907654,
+      "learning_rate": 8.607367475292004e-05,
+      "loss": 0.9337235689163208,
+      "step": 480
+    },
+    {
+      "epoch": 0.20798274002157496,
+      "grad_norm": 0.5693124532699585,
+      "learning_rate": 8.643306379155436e-05,
+      "loss": 0.9088928699493408,
+      "step": 482
+    },
+    {
+      "epoch": 0.20884573894282632,
+      "grad_norm": 0.7377229332923889,
+      "learning_rate": 8.679245283018869e-05,
+      "loss": 1.0729358196258545,
+      "step": 484
+    },
+    {
+      "epoch": 0.20970873786407768,
+      "grad_norm": 0.7399470210075378,
+      "learning_rate": 8.7151841868823e-05,
+      "loss": 1.0428457260131836,
+      "step": 486
+    },
+    {
+      "epoch": 0.210571736785329,
+      "grad_norm": 0.677052915096283,
+      "learning_rate": 8.751123090745734e-05,
+      "loss": 0.9940266013145447,
+      "step": 488
+    },
+    {
+      "epoch": 0.21143473570658036,
+      "grad_norm": 0.7126721739768982,
+      "learning_rate": 8.787061994609165e-05,
+      "loss": 1.011808156967163,
+      "step": 490
+    },
+    {
+      "epoch": 0.21229773462783172,
+      "grad_norm": 0.6663792729377747,
+      "learning_rate": 8.823000898472597e-05,
+      "loss": 1.0054185390472412,
+      "step": 492
+    },
+    {
+      "epoch": 0.21316073354908308,
+      "grad_norm": 0.6661092042922974,
+      "learning_rate": 8.85893980233603e-05,
+      "loss": 1.0167138576507568,
+      "step": 494
+    },
+    {
+      "epoch": 0.2140237324703344,
+      "grad_norm": 0.6975740194320679,
+      "learning_rate": 8.894878706199461e-05,
+      "loss": 1.1470818519592285,
+      "step": 496
+    },
+    {
+      "epoch": 0.21488673139158576,
+      "grad_norm": 0.6594390869140625,
+      "learning_rate": 8.930817610062893e-05,
+      "loss": 0.9619631171226501,
+      "step": 498
+    },
+    {
+      "epoch": 0.21574973031283712,
+      "grad_norm": 0.7216679453849792,
+      "learning_rate": 8.966756513926325e-05,
+      "loss": 0.9971368312835693,
+      "step": 500
+    },
+    {
+      "epoch": 0.21574973031283712,
+      "eval_loss": 1.0417571067810059,
+      "eval_runtime": 659.3112,
+      "eval_samples_per_second": 3.124,
+      "eval_steps_per_second": 3.124,
+      "step": 500
+    },
+    {
+      "epoch": 0.21661272923408845,
+      "grad_norm": 0.6188210844993591,
+      "learning_rate": 9.002695417789758e-05,
+      "loss": 1.0307213068008423,
+      "step": 502
+    },
+    {
+      "epoch": 0.2174757281553398,
+      "grad_norm": 0.6716445088386536,
+      "learning_rate": 9.03863432165319e-05,
+      "loss": 1.0188794136047363,
+      "step": 504
+    },
+    {
+      "epoch": 0.21833872707659116,
+      "grad_norm": 0.6790863275527954,
+      "learning_rate": 9.074573225516622e-05,
+      "loss": 0.9764845967292786,
+      "step": 506
+    },
+    {
+      "epoch": 0.2192017259978425,
+      "grad_norm": 0.6764960289001465,
+      "learning_rate": 9.110512129380054e-05,
+      "loss": 0.948829174041748,
+      "step": 508
+    },
+    {
+      "epoch": 0.22006472491909385,
+      "grad_norm": 0.6210965514183044,
+      "learning_rate": 9.146451033243486e-05,
+      "loss": 1.008013129234314,
+      "step": 510
+    },
+    {
+      "epoch": 0.2209277238403452,
+      "grad_norm": 0.7739297747612,
+      "learning_rate": 9.182389937106919e-05,
+      "loss": 1.1662557125091553,
+      "step": 512
+    },
+    {
+      "epoch": 0.22179072276159656,
+      "grad_norm": 0.7055562138557434,
+      "learning_rate": 9.21832884097035e-05,
+      "loss": 1.0325161218643188,
+      "step": 514
+    },
+    {
+      "epoch": 0.2226537216828479,
+      "grad_norm": 0.6079210042953491,
+      "learning_rate": 9.254267744833783e-05,
+      "loss": 1.00056791305542,
+      "step": 516
+    },
+    {
+      "epoch": 0.22351672060409924,
+      "grad_norm": 0.5974318981170654,
+      "learning_rate": 9.290206648697215e-05,
+      "loss": 0.9422364234924316,
+      "step": 518
+    },
+    {
+      "epoch": 0.2243797195253506,
+      "grad_norm": 0.5963430404663086,
+      "learning_rate": 9.326145552560648e-05,
+      "loss": 0.936336100101471,
+      "step": 520
+    },
+    {
+      "epoch": 0.22524271844660193,
+      "grad_norm": 0.6823658347129822,
+      "learning_rate": 9.36208445642408e-05,
+      "loss": 1.0538607835769653,
+      "step": 522
+    },
+    {
+      "epoch": 0.2261057173678533,
+      "grad_norm": 0.6409855484962463,
+      "learning_rate": 9.398023360287511e-05,
+      "loss": 1.0483653545379639,
+      "step": 524
+    },
+    {
+      "epoch": 0.22696871628910464,
+      "grad_norm": 0.6867254376411438,
+      "learning_rate": 9.433962264150944e-05,
+      "loss": 0.9668049812316895,
+      "step": 526
+    },
+    {
+      "epoch": 0.227831715210356,
+      "grad_norm": 0.5690792798995972,
+      "learning_rate": 9.469901168014376e-05,
+      "loss": 1.008763313293457,
+      "step": 528
+    },
+    {
+      "epoch": 0.22869471413160733,
+      "grad_norm": 0.5964897274971008,
+      "learning_rate": 9.505840071877809e-05,
+      "loss": 1.0816441774368286,
+      "step": 530
+    },
+    {
+      "epoch": 0.2295577130528587,
+      "grad_norm": 0.627419114112854,
+      "learning_rate": 9.54177897574124e-05,
+      "loss": 0.9265700578689575,
+      "step": 532
+    },
+    {
+      "epoch": 0.23042071197411004,
+      "grad_norm": 0.5862151980400085,
+      "learning_rate": 9.577717879604674e-05,
+      "loss": 0.9804646372795105,
+      "step": 534
+    },
+    {
+      "epoch": 0.23128371089536137,
+      "grad_norm": 0.5573718547821045,
+      "learning_rate": 9.613656783468105e-05,
+      "loss": 0.9627988934516907,
+      "step": 536
+    },
+    {
+      "epoch": 0.23214670981661273,
+      "grad_norm": 0.6705166101455688,
+      "learning_rate": 9.649595687331537e-05,
+      "loss": 1.0012824535369873,
+      "step": 538
+    },
+    {
+      "epoch": 0.23300970873786409,
+      "grad_norm": 0.6251236796379089,
+      "learning_rate": 9.685534591194969e-05,
+      "loss": 0.9568162560462952,
+      "step": 540
+    },
+    {
+      "epoch": 0.23387270765911541,
+      "grad_norm": 0.6466493010520935,
+      "learning_rate": 9.7214734950584e-05,
+      "loss": 1.031549334526062,
+      "step": 542
+    },
+    {
+      "epoch": 0.23473570658036677,
+      "grad_norm": 0.5183866024017334,
+      "learning_rate": 9.757412398921833e-05,
+      "loss": 0.8603643774986267,
+      "step": 544
+    },
+    {
+      "epoch": 0.23559870550161813,
+      "grad_norm": 0.6725775599479675,
+      "learning_rate": 9.793351302785265e-05,
+      "loss": 1.0365077257156372,
+      "step": 546
+    },
+    {
+      "epoch": 0.23646170442286948,
+      "grad_norm": 0.5972357988357544,
+      "learning_rate": 9.829290206648698e-05,
+      "loss": 0.9304701089859009,
+      "step": 548
+    },
+    {
+      "epoch": 0.2373247033441208,
+      "grad_norm": 0.5319957733154297,
+      "learning_rate": 9.86522911051213e-05,
+      "loss": 0.9575805068016052,
+      "step": 550
+    },
+    {
+      "epoch": 0.23818770226537217,
+      "grad_norm": 0.6502835750579834,
+      "learning_rate": 9.901168014375562e-05,
+      "loss": 1.0307214260101318,
+      "step": 552
+    },
+    {
+      "epoch": 0.23905070118662353,
+      "grad_norm": 0.6734047532081604,
+      "learning_rate": 9.937106918238994e-05,
+      "loss": 1.05185067653656,
+      "step": 554
+    },
+    {
+      "epoch": 0.23991370010787486,
+      "grad_norm": 0.5667978525161743,
+      "learning_rate": 9.973045822102426e-05,
+      "loss": 1.0190176963806152,
+      "step": 556
+    },
+    {
+      "epoch": 0.2407766990291262,
+      "grad_norm": 0.6370418667793274,
+      "learning_rate": 0.00010008984725965857,
+      "loss": 1.076182246208191,
+      "step": 558
+    },
+    {
+      "epoch": 0.24163969795037757,
+      "grad_norm": 0.689719021320343,
+      "learning_rate": 0.0001004492362982929,
+      "loss": 1.0408724546432495,
+      "step": 560
+    },
+    {
+      "epoch": 0.2425026968716289,
+      "grad_norm": 0.6304254531860352,
+      "learning_rate": 0.00010080862533692723,
+      "loss": 0.9869902729988098,
+      "step": 562
+    },
+    {
+      "epoch": 0.24336569579288025,
+      "grad_norm": 0.6797420382499695,
+      "learning_rate": 0.00010116801437556156,
+      "loss": 1.0198370218276978,
+      "step": 564
+    },
+    {
+      "epoch": 0.2442286947141316,
+      "grad_norm": 0.5993657112121582,
+      "learning_rate": 0.00010152740341419587,
+      "loss": 0.9947441816329956,
+      "step": 566
+    },
+    {
+      "epoch": 0.24509169363538297,
+      "grad_norm": 0.6369836330413818,
+      "learning_rate": 0.0001018867924528302,
+      "loss": 0.9722896814346313,
+      "step": 568
+    },
+    {
+      "epoch": 0.2459546925566343,
+      "grad_norm": 0.6942457556724548,
+      "learning_rate": 0.00010224618149146453,
+      "loss": 0.9716570973396301,
+      "step": 570
+    },
+    {
+      "epoch": 0.24681769147788565,
+      "grad_norm": 0.5403370261192322,
+      "learning_rate": 0.00010260557053009883,
+      "loss": 0.9797524213790894,
+      "step": 572
+    },
+    {
+      "epoch": 0.247680690399137,
+      "grad_norm": 0.5207529067993164,
+      "learning_rate": 0.00010296495956873316,
+      "loss": 0.985367476940155,
+      "step": 574
+    },
+    {
+      "epoch": 0.24854368932038834,
+      "grad_norm": 0.6751103401184082,
+      "learning_rate": 0.00010332434860736748,
+      "loss": 1.075042724609375,
+      "step": 576
+    },
+    {
+      "epoch": 0.2494066882416397,
+      "grad_norm": 0.565331220626831,
+      "learning_rate": 0.0001036837376460018,
+      "loss": 0.9273878335952759,
+      "step": 578
+    },
+    {
+      "epoch": 0.25026968716289105,
+      "grad_norm": 0.6858948469161987,
+      "learning_rate": 0.00010404312668463612,
+      "loss": 0.9872279763221741,
+      "step": 580
+    },
+    {
+      "epoch": 0.2511326860841424,
+      "grad_norm": 0.7091426253318787,
+      "learning_rate": 0.00010440251572327044,
+      "loss": 1.0038671493530273,
+      "step": 582
+    },
+    {
+      "epoch": 0.25199568500539377,
+      "grad_norm": 0.6493771076202393,
+      "learning_rate": 0.00010476190476190477,
+      "loss": 1.0109868049621582,
+      "step": 584
+    },
+    {
+      "epoch": 0.25285868392664507,
+      "grad_norm": 0.6107586622238159,
+      "learning_rate": 0.00010512129380053907,
+      "loss": 1.0020402669906616,
+      "step": 586
+    },
+    {
+      "epoch": 0.2537216828478964,
+      "grad_norm": 0.6878048181533813,
+      "learning_rate": 0.0001054806828391734,
+      "loss": 0.961039662361145,
+      "step": 588
+    },
+    {
+      "epoch": 0.2545846817691478,
+      "grad_norm": 0.664034903049469,
+      "learning_rate": 0.00010584007187780773,
+      "loss": 0.9725209474563599,
+      "step": 590
+    },
+    {
+      "epoch": 0.25544768069039914,
+      "grad_norm": 0.6399680376052856,
+      "learning_rate": 0.00010619946091644206,
+      "loss": 0.9907437562942505,
+      "step": 592
+    },
+    {
+      "epoch": 0.2563106796116505,
+      "grad_norm": 0.6163286566734314,
+      "learning_rate": 0.00010655884995507636,
+      "loss": 0.9650095701217651,
+      "step": 594
+    },
+    {
+      "epoch": 0.25717367853290185,
+      "grad_norm": 0.6008322238922119,
+      "learning_rate": 0.0001069182389937107,
+      "loss": 1.0102758407592773,
+      "step": 596
+    },
+    {
+      "epoch": 0.2580366774541532,
+      "grad_norm": 0.6752071380615234,
+      "learning_rate": 0.00010727762803234502,
+      "loss": 0.9101885557174683,
+      "step": 598
+    },
+    {
+      "epoch": 0.2588996763754045,
+      "grad_norm": 0.6789175868034363,
+      "learning_rate": 0.00010763701707097935,
+      "loss": 1.0461398363113403,
+      "step": 600
+    },
+    {
+      "epoch": 0.2588996763754045,
+      "eval_loss": 1.021111011505127,
+      "eval_runtime": 648.1611,
+      "eval_samples_per_second": 3.178,
+      "eval_steps_per_second": 3.178,
+      "step": 600
+    },
+    {
+      "epoch": 0.25976267529665586,
+      "grad_norm": 0.5660730600357056,
+      "learning_rate": 0.00010799640610961366,
+      "loss": 0.9582418203353882,
+      "step": 602
+    },
+    {
+      "epoch": 0.2606256742179072,
+      "grad_norm": 0.6726544499397278,
+      "learning_rate": 0.00010835579514824799,
+      "loss": 1.0763746500015259,
+      "step": 604
+    },
+    {
+      "epoch": 0.2614886731391586,
+      "grad_norm": 0.6068508625030518,
+      "learning_rate": 0.00010871518418688232,
+      "loss": 1.0432032346725464,
+      "step": 606
+    },
+    {
+      "epoch": 0.26235167206040994,
+      "grad_norm": 0.5731637477874756,
+      "learning_rate": 0.00010907457322551662,
+      "loss": 0.9830516576766968,
+      "step": 608
+    },
+    {
+      "epoch": 0.2632146709816613,
+      "grad_norm": 0.6777567267417908,
+      "learning_rate": 0.00010943396226415095,
+      "loss": 1.0442042350769043,
+      "step": 610
+    },
+    {
+      "epoch": 0.26407766990291265,
+      "grad_norm": 0.6372506618499756,
+      "learning_rate": 0.00010979335130278528,
+      "loss": 1.0001944303512573,
+      "step": 612
+    },
+    {
+      "epoch": 0.26494066882416395,
+      "grad_norm": 0.6606221795082092,
+      "learning_rate": 0.0001101527403414196,
+      "loss": 1.035884141921997,
+      "step": 614
+    },
+    {
+      "epoch": 0.2658036677454153,
+      "grad_norm": 0.6083229780197144,
+      "learning_rate": 0.00011051212938005391,
+      "loss": 0.9403397440910339,
+      "step": 616
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.6318517923355103,
+      "learning_rate": 0.00011087151841868823,
+      "loss": 0.9274454116821289,
+      "step": 618
+    },
+    {
+      "epoch": 0.267529665587918,
+      "grad_norm": 0.628190279006958,
+      "learning_rate": 0.00011123090745732256,
+      "loss": 0.9883754253387451,
+      "step": 620
+    },
+    {
+      "epoch": 0.2683926645091694,
+      "grad_norm": 0.5961961150169373,
+      "learning_rate": 0.00011159029649595688,
+      "loss": 1.0317535400390625,
+      "step": 622
+    },
+    {
+      "epoch": 0.26925566343042073,
+      "grad_norm": 0.5995341539382935,
+      "learning_rate": 0.00011194968553459119,
+      "loss": 0.9776126742362976,
+      "step": 624
+    },
+    {
+      "epoch": 0.27011866235167203,
+      "grad_norm": 0.6639334559440613,
+      "learning_rate": 0.00011230907457322552,
+      "loss": 1.0112378597259521,
+      "step": 626
+    },
+    {
+      "epoch": 0.2709816612729234,
+      "grad_norm": 0.6348621249198914,
+      "learning_rate": 0.00011266846361185985,
+      "loss": 1.0553804636001587,
+      "step": 628
+    },
+    {
+      "epoch": 0.27184466019417475,
+      "grad_norm": 0.5929805040359497,
+      "learning_rate": 0.00011302785265049416,
+      "loss": 1.025888442993164,
+      "step": 630
+    },
+    {
+      "epoch": 0.2727076591154261,
+      "grad_norm": 0.6052366495132446,
+      "learning_rate": 0.00011338724168912849,
+      "loss": 1.02956223487854,
+      "step": 632
+    },
+    {
+      "epoch": 0.27357065803667746,
+      "grad_norm": 0.6494882106781006,
+      "learning_rate": 0.00011374663072776282,
+      "loss": 0.989752471446991,
+      "step": 634
+    },
+    {
+      "epoch": 0.2744336569579288,
+      "grad_norm": 0.6005767583847046,
+      "learning_rate": 0.00011410601976639712,
+      "loss": 1.0030683279037476,
+      "step": 636
+    },
+    {
+      "epoch": 0.2752966558791802,
+      "grad_norm": 0.6478356122970581,
+      "learning_rate": 0.00011446540880503145,
+      "loss": 1.002000093460083,
+      "step": 638
+    },
+    {
+      "epoch": 0.2761596548004315,
+      "grad_norm": 0.5804725289344788,
+      "learning_rate": 0.00011482479784366578,
+      "loss": 0.9807654023170471,
+      "step": 640
+    },
+    {
+      "epoch": 0.27702265372168283,
+      "grad_norm": 0.632530689239502,
+      "learning_rate": 0.00011518418688230011,
+      "loss": 0.9841892123222351,
+      "step": 642
+    },
+    {
+      "epoch": 0.2778856526429342,
+      "grad_norm": 0.5340113639831543,
+      "learning_rate": 0.00011554357592093441,
+      "loss": 0.8902478814125061,
+      "step": 644
+    },
+    {
+      "epoch": 0.27874865156418555,
+      "grad_norm": 0.5901665091514587,
+      "learning_rate": 0.00011590296495956874,
+      "loss": 0.9019404649734497,
+      "step": 646
+    },
+    {
+      "epoch": 0.2796116504854369,
+      "grad_norm": 0.666589617729187,
+      "learning_rate": 0.00011626235399820307,
+      "loss": 0.9384423494338989,
+      "step": 648
+    },
+    {
+      "epoch": 0.28047464940668826,
+      "grad_norm": 0.7000334858894348,
+      "learning_rate": 0.00011662174303683737,
+      "loss": 1.0666629076004028,
+      "step": 650
+    },
+    {
+      "epoch": 0.2813376483279396,
+      "grad_norm": 0.663663923740387,
+      "learning_rate": 0.0001169811320754717,
+      "loss": 1.000019907951355,
+      "step": 652
+    },
+    {
+      "epoch": 0.2822006472491909,
+      "grad_norm": 0.6097694039344788,
+      "learning_rate": 0.00011734052111410603,
+      "loss": 0.9450293183326721,
+      "step": 654
+    },
+    {
+      "epoch": 0.2830636461704423,
+      "grad_norm": 0.6130967140197754,
+      "learning_rate": 0.00011769991015274035,
+      "loss": 0.9480894207954407,
+      "step": 656
+    },
+    {
+      "epoch": 0.28392664509169363,
+      "grad_norm": 0.7091249227523804,
+      "learning_rate": 0.00011805929919137467,
+      "loss": 1.1377143859863281,
+      "step": 658
+    },
+    {
+      "epoch": 0.284789644012945,
+      "grad_norm": 0.6556766629219055,
+      "learning_rate": 0.00011841868823000898,
+      "loss": 0.9421243667602539,
+      "step": 660
+    },
+    {
+      "epoch": 0.28565264293419634,
+      "grad_norm": 0.6682968735694885,
+      "learning_rate": 0.00011877807726864331,
+      "loss": 0.9726828336715698,
+      "step": 662
+    },
+    {
+      "epoch": 0.2865156418554477,
+      "grad_norm": 0.5224708914756775,
+      "learning_rate": 0.00011913746630727762,
+      "loss": 0.8996511697769165,
+      "step": 664
+    },
+    {
+      "epoch": 0.287378640776699,
+      "grad_norm": 0.5914195775985718,
+      "learning_rate": 0.00011949685534591195,
+      "loss": 0.9679517149925232,
+      "step": 666
+    },
+    {
+      "epoch": 0.28824163969795036,
+      "grad_norm": 0.6175519824028015,
+      "learning_rate": 0.00011985624438454628,
+      "loss": 0.8743209838867188,
+      "step": 668
+    },
+    {
+      "epoch": 0.2891046386192017,
+      "grad_norm": 0.6019226312637329,
+      "learning_rate": 0.0001202156334231806,
+      "loss": 0.9741992354393005,
+      "step": 670
+    },
+    {
+      "epoch": 0.28996763754045307,
+      "grad_norm": 0.6080542206764221,
+      "learning_rate": 0.00012057502246181491,
+      "loss": 0.9516472816467285,
+      "step": 672
+    },
+    {
+      "epoch": 0.29083063646170443,
+      "grad_norm": 0.5885615944862366,
+      "learning_rate": 0.00012093441150044924,
+      "loss": 1.122761607170105,
+      "step": 674
+    },
+    {
+      "epoch": 0.2916936353829558,
+      "grad_norm": 0.6635209918022156,
+      "learning_rate": 0.00012129380053908357,
+      "loss": 1.0105189085006714,
+      "step": 676
+    },
+    {
+      "epoch": 0.29255663430420714,
+      "grad_norm": 0.5805009007453918,
+      "learning_rate": 0.0001216531895777179,
+      "loss": 0.906292200088501,
+      "step": 678
+    },
+    {
+      "epoch": 0.29341963322545844,
+      "grad_norm": 0.5980029702186584,
+      "learning_rate": 0.0001220125786163522,
+      "loss": 1.009568691253662,
+      "step": 680
+    },
+    {
+      "epoch": 0.2942826321467098,
+      "grad_norm": 0.6797705292701721,
+      "learning_rate": 0.00012237196765498652,
+      "loss": 1.0373667478561401,
+      "step": 682
+    },
+    {
+      "epoch": 0.29514563106796116,
+      "grad_norm": 0.6280547976493835,
+      "learning_rate": 0.00012273135669362085,
+      "loss": 0.9758188724517822,
+      "step": 684
+    },
+    {
+      "epoch": 0.2960086299892125,
+      "grad_norm": 0.511608898639679,
+      "learning_rate": 0.00012309074573225515,
+      "loss": 0.9111692905426025,
+      "step": 686
+    },
+    {
+      "epoch": 0.29687162891046387,
+      "grad_norm": 0.5781835317611694,
+      "learning_rate": 0.00012345013477088948,
+      "loss": 0.8865921497344971,
+      "step": 688
+    },
+    {
+      "epoch": 0.2977346278317152,
+      "grad_norm": 0.6514166593551636,
+      "learning_rate": 0.0001238095238095238,
+      "loss": 0.9768189191818237,
+      "step": 690
+    },
+    {
+      "epoch": 0.2985976267529666,
+      "grad_norm": 0.6109189987182617,
+      "learning_rate": 0.00012416891284815814,
+      "loss": 0.9991607069969177,
+      "step": 692
+    },
+    {
+      "epoch": 0.2994606256742179,
+      "grad_norm": 0.6598902344703674,
+      "learning_rate": 0.00012452830188679244,
+      "loss": 0.9548360705375671,
+      "step": 694
+    },
+    {
+      "epoch": 0.30032362459546924,
+      "grad_norm": 0.5633156895637512,
+      "learning_rate": 0.00012488769092542677,
+      "loss": 0.992988109588623,
+      "step": 696
+    },
+    {
+      "epoch": 0.3011866235167206,
+      "grad_norm": 0.6098802089691162,
+      "learning_rate": 0.0001252470799640611,
+      "loss": 0.9709890484809875,
+      "step": 698
+    },
+    {
+      "epoch": 0.30204962243797195,
+      "grad_norm": 0.6197102665901184,
+      "learning_rate": 0.0001256064690026954,
+      "loss": 1.018282175064087,
+      "step": 700
+    },
+    {
+      "epoch": 0.30204962243797195,
+      "eval_loss": 1.0030721426010132,
+      "eval_runtime": 655.4533,
+      "eval_samples_per_second": 3.143,
+      "eval_steps_per_second": 3.143,
+      "step": 700
+    },
+    {
+      "epoch": 0.3029126213592233,
+      "grad_norm": 0.5817480683326721,
+      "learning_rate": 0.00012596585804132974,
+      "loss": 0.9147283434867859,
+      "step": 702
+    },
+    {
+      "epoch": 0.30377562028047467,
+      "grad_norm": 0.5976696014404297,
+      "learning_rate": 0.00012632524707996407,
+      "loss": 0.9318362474441528,
+      "step": 704
+    },
+    {
+      "epoch": 0.304638619201726,
+      "grad_norm": 0.6389723420143127,
+      "learning_rate": 0.0001266846361185984,
+      "loss": 0.9500927925109863,
+      "step": 706
+    },
+    {
+      "epoch": 0.3055016181229773,
+      "grad_norm": 0.6485719084739685,
+      "learning_rate": 0.0001270440251572327,
+      "loss": 1.0271424055099487,
+      "step": 708
+    },
+    {
+      "epoch": 0.3063646170442287,
+      "grad_norm": 0.5802455544471741,
+      "learning_rate": 0.00012740341419586703,
+      "loss": 0.9781906008720398,
+      "step": 710
+    },
+    {
+      "epoch": 0.30722761596548004,
+      "grad_norm": 0.6359356641769409,
+      "learning_rate": 0.00012776280323450136,
+      "loss": 1.0195324420928955,
+      "step": 712
+    },
+    {
+      "epoch": 0.3080906148867314,
+      "grad_norm": 0.5975426435470581,
+      "learning_rate": 0.00012812219227313566,
+      "loss": 0.9250738024711609,
+      "step": 714
+    },
+    {
+      "epoch": 0.30895361380798275,
+      "grad_norm": 0.643110454082489,
+      "learning_rate": 0.00012848158131177,
+      "loss": 0.9888015985488892,
+      "step": 716
+    },
+    {
+      "epoch": 0.3098166127292341,
+      "grad_norm": 0.6043205261230469,
+      "learning_rate": 0.00012884097035040432,
+      "loss": 0.9709514379501343,
+      "step": 718
+    },
+    {
+      "epoch": 0.3106796116504854,
+      "grad_norm": 0.5687094926834106,
+      "learning_rate": 0.00012920035938903865,
+      "loss": 1.0272964239120483,
+      "step": 720
+    },
+    {
+      "epoch": 0.31154261057173677,
+      "grad_norm": 0.5688400864601135,
+      "learning_rate": 0.00012955974842767296,
+      "loss": 0.9370370507240295,
+      "step": 722
+    },
+    {
+      "epoch": 0.3124056094929881,
+      "grad_norm": 0.5610610246658325,
+      "learning_rate": 0.00012991913746630729,
+      "loss": 0.9535608291625977,
+      "step": 724
+    },
+    {
+      "epoch": 0.3132686084142395,
+      "grad_norm": 0.6338257193565369,
+      "learning_rate": 0.00013027852650494162,
+      "loss": 1.0188907384872437,
+      "step": 726
+    },
+    {
+      "epoch": 0.31413160733549084,
+      "grad_norm": 0.5365633368492126,
+      "learning_rate": 0.00013063791554357592,
+      "loss": 0.9253716468811035,
+      "step": 728
+    },
+    {
+      "epoch": 0.3149946062567422,
+      "grad_norm": 0.5599163174629211,
+      "learning_rate": 0.00013099730458221025,
+      "loss": 0.8941492438316345,
+      "step": 730
+    },
+    {
+      "epoch": 0.31585760517799355,
+      "grad_norm": 0.6059780716896057,
+      "learning_rate": 0.00013135669362084458,
+      "loss": 0.9831459522247314,
+      "step": 732
+    },
+    {
+      "epoch": 0.31672060409924485,
+      "grad_norm": 0.5596494078636169,
+      "learning_rate": 0.0001317160826594789,
+      "loss": 0.9332310557365417,
+      "step": 734
+    },
+    {
+      "epoch": 0.3175836030204962,
+      "grad_norm": 0.5618010759353638,
+      "learning_rate": 0.0001320754716981132,
+      "loss": 0.9082580208778381,
+      "step": 736
+    },
+    {
+      "epoch": 0.31844660194174756,
+      "grad_norm": 0.6412109732627869,
+      "learning_rate": 0.00013243486073674754,
+      "loss": 1.008690357208252,
+      "step": 738
+    },
+    {
+      "epoch": 0.3193096008629989,
+      "grad_norm": 0.5742355585098267,
+      "learning_rate": 0.00013279424977538187,
+      "loss": 0.9597798585891724,
+      "step": 740
+    },
+    {
+      "epoch": 0.3201725997842503,
+      "grad_norm": 0.6470226645469666,
+      "learning_rate": 0.00013315363881401617,
+      "loss": 0.989331841468811,
+      "step": 742
+    },
+    {
+      "epoch": 0.32103559870550163,
+      "grad_norm": 0.5598039031028748,
+      "learning_rate": 0.0001335130278526505,
+      "loss": 0.8677343130111694,
+      "step": 744
+    },
+    {
+      "epoch": 0.321898597626753,
+      "grad_norm": 0.5441372990608215,
+      "learning_rate": 0.00013387241689128483,
+      "loss": 0.9462730288505554,
+      "step": 746
+    },
+    {
+      "epoch": 0.3227615965480043,
+      "grad_norm": 0.5858626365661621,
+      "learning_rate": 0.00013423180592991916,
+      "loss": 0.994694173336029,
+      "step": 748
+    },
+    {
+      "epoch": 0.32362459546925565,
+      "grad_norm": 0.511372447013855,
+      "learning_rate": 0.00013459119496855347,
+      "loss": 0.9387269616127014,
+      "step": 750
+    },
+    {
+      "epoch": 0.324487594390507,
+      "grad_norm": 0.47798457741737366,
+      "learning_rate": 0.0001349505840071878,
+      "loss": 0.9473881721496582,
+      "step": 752
+    },
+    {
+      "epoch": 0.32535059331175836,
+      "grad_norm": 0.5907022953033447,
+      "learning_rate": 0.0001353099730458221,
+      "loss": 0.9375183582305908,
+      "step": 754
+    },
+    {
+      "epoch": 0.3262135922330097,
+      "grad_norm": 0.618733286857605,
+      "learning_rate": 0.00013566936208445643,
+      "loss": 1.028738260269165,
+      "step": 756
+    },
+    {
+      "epoch": 0.3270765911542611,
+      "grad_norm": 0.5234512090682983,
+      "learning_rate": 0.00013602875112309076,
+      "loss": 0.9420192241668701,
+      "step": 758
+    },
+    {
+      "epoch": 0.32793959007551243,
+      "grad_norm": 0.7036319971084595,
+      "learning_rate": 0.00013638814016172506,
+      "loss": 1.0252270698547363,
+      "step": 760
+    },
+    {
+      "epoch": 0.32880258899676373,
+      "grad_norm": 0.5543172359466553,
+      "learning_rate": 0.0001367475292003594,
+      "loss": 0.8453778028488159,
+      "step": 762
+    },
+    {
+      "epoch": 0.3296655879180151,
+      "grad_norm": 0.5438711643218994,
+      "learning_rate": 0.0001371069182389937,
+      "loss": 0.8659937977790833,
+      "step": 764
+    },
+    {
+      "epoch": 0.33052858683926645,
+      "grad_norm": 0.6390914916992188,
+      "learning_rate": 0.00013746630727762803,
+      "loss": 1.038142442703247,
+      "step": 766
+    },
+    {
+      "epoch": 0.3313915857605178,
+      "grad_norm": 0.50070720911026,
+      "learning_rate": 0.00013782569631626236,
+      "loss": 0.899932861328125,
+      "step": 768
+    },
+    {
+      "epoch": 0.33225458468176916,
+      "grad_norm": 0.5982286334037781,
+      "learning_rate": 0.00013818508535489669,
+      "loss": 0.9712884426116943,
+      "step": 770
+    },
+    {
+      "epoch": 0.3331175836030205,
+      "grad_norm": 0.6588822603225708,
+      "learning_rate": 0.000138544474393531,
+      "loss": 0.9427542686462402,
+      "step": 772
+    },
+    {
+      "epoch": 0.3339805825242718,
+      "grad_norm": 0.6022042632102966,
+      "learning_rate": 0.00013890386343216532,
+      "loss": 0.8961561918258667,
+      "step": 774
+    },
+    {
+      "epoch": 0.3348435814455232,
+      "grad_norm": 0.6595642566680908,
+      "learning_rate": 0.00013926325247079965,
+      "loss": 0.9525937438011169,
+      "step": 776
+    },
+    {
+      "epoch": 0.33570658036677453,
+      "grad_norm": 0.5210421681404114,
+      "learning_rate": 0.00013962264150943395,
+      "loss": 0.9218845367431641,
+      "step": 778
+    },
+    {
+      "epoch": 0.3365695792880259,
+      "grad_norm": 0.549669623374939,
+      "learning_rate": 0.00013998203054806828,
+      "loss": 0.877951443195343,
+      "step": 780
+    },
+    {
+      "epoch": 0.33743257820927725,
+      "grad_norm": 0.5360157489776611,
+      "learning_rate": 0.0001403414195867026,
+      "loss": 0.8670064210891724,
+      "step": 782
+    },
+    {
+      "epoch": 0.3382955771305286,
+      "grad_norm": 0.614734947681427,
+      "learning_rate": 0.00014070080862533694,
+      "loss": 0.9561367630958557,
+      "step": 784
+    },
+    {
+      "epoch": 0.33915857605177996,
+      "grad_norm": 0.5798251628875732,
+      "learning_rate": 0.00014106019766397124,
+      "loss": 0.9132505059242249,
+      "step": 786
+    },
+    {
+      "epoch": 0.34002157497303126,
+      "grad_norm": 0.6267077326774597,
+      "learning_rate": 0.00014141958670260557,
+      "loss": 0.9297707080841064,
+      "step": 788
+    },
+    {
+      "epoch": 0.3408845738942826,
+      "grad_norm": 0.6045349836349487,
+      "learning_rate": 0.0001417789757412399,
+      "loss": 0.9382412433624268,
+      "step": 790
+    },
+    {
+      "epoch": 0.341747572815534,
+      "grad_norm": 0.6125404834747314,
+      "learning_rate": 0.0001421383647798742,
+      "loss": 0.9078555107116699,
+      "step": 792
+    },
+    {
+      "epoch": 0.34261057173678533,
+      "grad_norm": 0.5927051901817322,
+      "learning_rate": 0.00014249775381850854,
+      "loss": 0.899101197719574,
+      "step": 794
+    },
+    {
+      "epoch": 0.3434735706580367,
+      "grad_norm": 0.6315743923187256,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.028346300125122,
+      "step": 796
+    },
+    {
+      "epoch": 0.34433656957928804,
+      "grad_norm": 0.549271285533905,
+      "learning_rate": 0.0001432165318957772,
+      "loss": 0.8988189697265625,
+      "step": 798
+    },
+    {
+      "epoch": 0.3451995685005394,
+      "grad_norm": 0.6344245672225952,
+      "learning_rate": 0.0001435759209344115,
+      "loss": 1.0489003658294678,
+      "step": 800
+    },
+    {
+      "epoch": 0.3451995685005394,
+      "eval_loss": 0.9864639639854431,
+      "eval_runtime": 667.3516,
+      "eval_samples_per_second": 3.087,
+      "eval_steps_per_second": 3.087,
+      "step": 800
+    },
+    {
+      "epoch": 0.3460625674217907,
+      "grad_norm": 0.5625309348106384,
+      "learning_rate": 0.00014393530997304583,
+      "loss": 0.8773928284645081,
+      "step": 802
+    },
+    {
+      "epoch": 0.34692556634304206,
+      "grad_norm": 0.5931969285011292,
+      "learning_rate": 0.00014429469901168016,
+      "loss": 0.9116050004959106,
+      "step": 804
+    },
+    {
+      "epoch": 0.3477885652642934,
+      "grad_norm": 0.5189821720123291,
+      "learning_rate": 0.00014465408805031446,
+      "loss": 0.9124425649642944,
+      "step": 806
+    },
+    {
+      "epoch": 0.34865156418554477,
+      "grad_norm": 0.5392254590988159,
+      "learning_rate": 0.0001450134770889488,
+      "loss": 0.9517888426780701,
+      "step": 808
+    },
+    {
+      "epoch": 0.34951456310679613,
+      "grad_norm": 0.5584444999694824,
+      "learning_rate": 0.00014537286612758312,
+      "loss": 0.9947572350502014,
+      "step": 810
+    },
+    {
+      "epoch": 0.3503775620280475,
+      "grad_norm": 0.5188854932785034,
+      "learning_rate": 0.00014573225516621745,
+      "loss": 0.9314022660255432,
+      "step": 812
+    },
+    {
+      "epoch": 0.3512405609492988,
+      "grad_norm": 0.5783659815788269,
+      "learning_rate": 0.00014609164420485176,
+      "loss": 0.9135628938674927,
+      "step": 814
+    },
+    {
+      "epoch": 0.35210355987055014,
+      "grad_norm": 0.550959050655365,
+      "learning_rate": 0.0001464510332434861,
+      "loss": 0.9665075540542603,
+      "step": 816
+    },
+    {
+      "epoch": 0.3529665587918015,
+      "grad_norm": 0.6013346314430237,
+      "learning_rate": 0.00014681042228212042,
+      "loss": 0.9836555123329163,
+      "step": 818
+    },
+    {
+      "epoch": 0.35382955771305286,
+      "grad_norm": 0.49219194054603577,
+      "learning_rate": 0.00014716981132075472,
+      "loss": 0.8900108337402344,
+      "step": 820
+    },
+    {
+      "epoch": 0.3546925566343042,
+      "grad_norm": 0.5517411828041077,
+      "learning_rate": 0.00014752920035938905,
+      "loss": 0.8769304156303406,
+      "step": 822
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.6062695980072021,
+      "learning_rate": 0.00014788858939802338,
+      "loss": 0.9744759202003479,
+      "step": 824
+    },
+    {
+      "epoch": 0.3564185544768069,
+      "grad_norm": 0.5132041573524475,
+      "learning_rate": 0.0001482479784366577,
+      "loss": 0.8875447511672974,
+      "step": 826
+    },
+    {
+      "epoch": 0.3572815533980582,
+      "grad_norm": 0.551799476146698,
+      "learning_rate": 0.000148607367475292,
+      "loss": 0.962710440158844,
+      "step": 828
+    },
+    {
+      "epoch": 0.3581445523193096,
+      "grad_norm": 0.6046625971794128,
+      "learning_rate": 0.00014896675651392634,
+      "loss": 0.8997528553009033,
+      "step": 830
+    },
+    {
+      "epoch": 0.35900755124056094,
+      "grad_norm": 0.560025691986084,
+      "learning_rate": 0.00014932614555256067,
+      "loss": 0.9541417360305786,
+      "step": 832
+    },
+    {
+      "epoch": 0.3598705501618123,
+      "grad_norm": 0.6441047787666321,
+      "learning_rate": 0.00014968553459119498,
+      "loss": 0.907791018486023,
+      "step": 834
+    },
+    {
+      "epoch": 0.36073354908306365,
+      "grad_norm": 0.5636281967163086,
+      "learning_rate": 0.0001500449236298293,
+      "loss": 1.0295937061309814,
+      "step": 836
+    },
+    {
+      "epoch": 0.361596548004315,
+      "grad_norm": 0.5528303384780884,
+      "learning_rate": 0.0001504043126684636,
+      "loss": 0.8875265717506409,
+      "step": 838
+    },
+    {
+      "epoch": 0.36245954692556637,
+      "grad_norm": 0.5345163345336914,
+      "learning_rate": 0.00015076370170709794,
+      "loss": 0.9678915739059448,
+      "step": 840
+    },
+    {
+      "epoch": 0.36332254584681767,
+      "grad_norm": 0.5551225543022156,
+      "learning_rate": 0.00015112309074573224,
+      "loss": 0.9235162734985352,
+      "step": 842
+    },
+    {
+      "epoch": 0.364185544768069,
+      "grad_norm": 0.5131904482841492,
+      "learning_rate": 0.00015148247978436657,
+      "loss": 0.8624292016029358,
+      "step": 844
+    },
+    {
+      "epoch": 0.3650485436893204,
+      "grad_norm": 0.6811004281044006,
+      "learning_rate": 0.0001518418688230009,
+      "loss": 1.0360193252563477,
+      "step": 846
+    },
+    {
+      "epoch": 0.36591154261057174,
+      "grad_norm": 0.6409741640090942,
+      "learning_rate": 0.00015220125786163523,
+      "loss": 0.9254010915756226,
+      "step": 848
+    },
+    {
+      "epoch": 0.3667745415318231,
+      "grad_norm": 0.5534068942070007,
+      "learning_rate": 0.00015256064690026953,
+      "loss": 0.8900630474090576,
+      "step": 850
+    },
+    {
+      "epoch": 0.36763754045307445,
+      "grad_norm": 0.4999487102031708,
+      "learning_rate": 0.00015292003593890386,
+      "loss": 0.88521409034729,
+      "step": 852
+    },
+    {
+      "epoch": 0.3685005393743258,
+      "grad_norm": 0.5805923938751221,
+      "learning_rate": 0.0001532794249775382,
+      "loss": 0.9563921093940735,
+      "step": 854
+    },
+    {
+      "epoch": 0.3693635382955771,
+      "grad_norm": 0.5485470294952393,
+      "learning_rate": 0.0001536388140161725,
+      "loss": 0.8909372687339783,
+      "step": 856
+    },
+    {
+      "epoch": 0.37022653721682847,
+      "grad_norm": 0.5317923426628113,
+      "learning_rate": 0.00015399820305480683,
+      "loss": 0.9145731925964355,
+      "step": 858
+    },
+    {
+      "epoch": 0.3710895361380798,
+      "grad_norm": 0.6073495745658875,
+      "learning_rate": 0.00015435759209344116,
+      "loss": 1.01466965675354,
+      "step": 860
+    },
+    {
+      "epoch": 0.3719525350593312,
+      "grad_norm": 0.566655158996582,
+      "learning_rate": 0.0001547169811320755,
+      "loss": 0.9941825270652771,
+      "step": 862
+    },
+    {
+      "epoch": 0.37281553398058254,
+      "grad_norm": 0.5262459516525269,
+      "learning_rate": 0.0001550763701707098,
+      "loss": 1.0059782266616821,
+      "step": 864
+    },
+    {
+      "epoch": 0.3736785329018339,
+      "grad_norm": 0.6264083981513977,
+      "learning_rate": 0.00015543575920934412,
+      "loss": 1.0332856178283691,
+      "step": 866
+    },
+    {
+      "epoch": 0.3745415318230852,
+      "grad_norm": 0.6575480699539185,
+      "learning_rate": 0.00015579514824797845,
+      "loss": 1.022459626197815,
+      "step": 868
+    },
+    {
+      "epoch": 0.37540453074433655,
+      "grad_norm": 0.6291940212249756,
+      "learning_rate": 0.00015615453728661275,
+      "loss": 0.9550372362136841,
+      "step": 870
+    },
+    {
+      "epoch": 0.3762675296655879,
+      "grad_norm": 0.6710562109947205,
+      "learning_rate": 0.00015651392632524708,
+      "loss": 0.9861716628074646,
+      "step": 872
+    },
+    {
+      "epoch": 0.37713052858683926,
+      "grad_norm": 0.5505748987197876,
+      "learning_rate": 0.0001568733153638814,
+      "loss": 0.9719111919403076,
+      "step": 874
+    },
+    {
+      "epoch": 0.3779935275080906,
+      "grad_norm": 0.5055180788040161,
+      "learning_rate": 0.00015723270440251574,
+      "loss": 0.8698170185089111,
+      "step": 876
+    },
+    {
+      "epoch": 0.378856526429342,
+      "grad_norm": 0.5935947895050049,
+      "learning_rate": 0.00015759209344115005,
+      "loss": 1.029494285583496,
+      "step": 878
+    },
+    {
+      "epoch": 0.37971952535059333,
+      "grad_norm": 0.538325846195221,
+      "learning_rate": 0.00015795148247978438,
+      "loss": 0.923010528087616,
+      "step": 880
+    },
+    {
+      "epoch": 0.38058252427184464,
+      "grad_norm": 0.587297797203064,
+      "learning_rate": 0.0001583108715184187,
+      "loss": 0.9394056797027588,
+      "step": 882
+    },
+    {
+      "epoch": 0.381445523193096,
+      "grad_norm": 0.5910462737083435,
+      "learning_rate": 0.000158670260557053,
+      "loss": 0.9472483992576599,
+      "step": 884
+    },
+    {
+      "epoch": 0.38230852211434735,
+      "grad_norm": 0.629048764705658,
+      "learning_rate": 0.00015902964959568734,
+      "loss": 0.9028263688087463,
+      "step": 886
+    },
+    {
+      "epoch": 0.3831715210355987,
+      "grad_norm": 0.5028086304664612,
+      "learning_rate": 0.00015938903863432167,
+      "loss": 0.9579087495803833,
+      "step": 888
+    },
+    {
+      "epoch": 0.38403451995685006,
+      "grad_norm": 0.5372384190559387,
+      "learning_rate": 0.000159748427672956,
+      "loss": 0.8318673372268677,
+      "step": 890
+    },
+    {
+      "epoch": 0.3848975188781014,
+      "grad_norm": 0.6314184665679932,
+      "learning_rate": 0.0001601078167115903,
+      "loss": 0.9804943203926086,
+      "step": 892
+    },
+    {
+      "epoch": 0.3857605177993528,
+      "grad_norm": 0.5545229911804199,
+      "learning_rate": 0.00016046720575022463,
+      "loss": 1.0078438520431519,
+      "step": 894
+    },
+    {
+      "epoch": 0.3866235167206041,
+      "grad_norm": 0.4674014151096344,
+      "learning_rate": 0.00016082659478885896,
+      "loss": 0.9269036650657654,
+      "step": 896
+    },
+    {
+      "epoch": 0.38748651564185543,
+      "grad_norm": 1.5887153148651123,
+      "learning_rate": 0.00016118598382749326,
+      "loss": 0.8927953243255615,
+      "step": 898
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "grad_norm": 0.5217035412788391,
+      "learning_rate": 0.0001615453728661276,
+      "loss": 0.908074140548706,
+      "step": 900
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "eval_loss": 0.9741895794868469,
+      "eval_runtime": 667.2236,
+      "eval_samples_per_second": 3.087,
+      "eval_steps_per_second": 3.087,
+      "step": 900
+    },
+    {
+      "epoch": 0.38921251348435815,
+      "grad_norm": 0.470498651266098,
+      "learning_rate": 0.00016190476190476192,
+      "loss": 0.9660369157791138,
+      "step": 902
+    },
+    {
+      "epoch": 0.3900755124056095,
+      "grad_norm": 0.5111004114151001,
+      "learning_rate": 0.00016226415094339625,
+      "loss": 0.9236379265785217,
+      "step": 904
+    },
+    {
+      "epoch": 0.39093851132686086,
+      "grad_norm": 0.5872815251350403,
+      "learning_rate": 0.00016262353998203056,
+      "loss": 1.0061595439910889,
+      "step": 906
+    },
+    {
+      "epoch": 0.3918015102481122,
+      "grad_norm": 0.5150740742683411,
+      "learning_rate": 0.0001629829290206649,
+      "loss": 0.8347328901290894,
+      "step": 908
+    },
+    {
+      "epoch": 0.3926645091693635,
+      "grad_norm": 0.46554985642433167,
+      "learning_rate": 0.00016334231805929922,
+      "loss": 0.9091183543205261,
+      "step": 910
+    },
+    {
+      "epoch": 0.3935275080906149,
+      "grad_norm": 0.5292875170707703,
+      "learning_rate": 0.00016370170709793352,
+      "loss": 0.9299798011779785,
+      "step": 912
+    },
+    {
+      "epoch": 0.39439050701186623,
+      "grad_norm": 0.5177125930786133,
+      "learning_rate": 0.00016406109613656785,
+      "loss": 0.942286491394043,
+      "step": 914
+    },
+    {
+      "epoch": 0.3952535059331176,
+      "grad_norm": 0.5564161539077759,
+      "learning_rate": 0.00016442048517520215,
+      "loss": 0.825290858745575,
+      "step": 916
+    },
+    {
+      "epoch": 0.39611650485436894,
+      "grad_norm": 0.5572530031204224,
+      "learning_rate": 0.00016477987421383648,
+      "loss": 0.876898467540741,
+      "step": 918
+    },
+    {
+      "epoch": 0.3969795037756203,
+      "grad_norm": 0.7294673323631287,
+      "learning_rate": 0.0001651392632524708,
+      "loss": 0.8949798941612244,
+      "step": 920
+    },
+    {
+      "epoch": 0.3978425026968716,
+      "grad_norm": 0.5234251022338867,
+      "learning_rate": 0.00016549865229110512,
+      "loss": 0.8457819223403931,
+      "step": 922
+    },
+    {
+      "epoch": 0.39870550161812296,
+      "grad_norm": 0.5273709893226624,
+      "learning_rate": 0.00016585804132973945,
+      "loss": 0.9080174565315247,
+      "step": 924
+    },
+    {
+      "epoch": 0.3995685005393743,
+      "grad_norm": 0.5795063376426697,
+      "learning_rate": 0.00016621743036837378,
+      "loss": 1.0304023027420044,
+      "step": 926
+    },
+    {
+      "epoch": 0.4004314994606257,
+      "grad_norm": 0.6153313517570496,
+      "learning_rate": 0.00016657681940700808,
+      "loss": 0.8900477886199951,
+      "step": 928
+    },
+    {
+      "epoch": 0.40129449838187703,
+      "grad_norm": 0.6293173432350159,
+      "learning_rate": 0.0001669362084456424,
+      "loss": 1.0130009651184082,
+      "step": 930
+    },
+    {
+      "epoch": 0.4021574973031284,
+      "grad_norm": 0.5455223321914673,
+      "learning_rate": 0.00016729559748427674,
+      "loss": 0.9339282512664795,
+      "step": 932
+    },
+    {
+      "epoch": 0.40302049622437974,
+      "grad_norm": 0.5349094271659851,
+      "learning_rate": 0.00016765498652291104,
+      "loss": 0.9628980755805969,
+      "step": 934
+    },
+    {
+      "epoch": 0.40388349514563104,
+      "grad_norm": 0.491227924823761,
+      "learning_rate": 0.00016801437556154537,
+      "loss": 0.8922860026359558,
+      "step": 936
+    },
+    {
+      "epoch": 0.4047464940668824,
+      "grad_norm": 0.6331246495246887,
+      "learning_rate": 0.0001683737646001797,
+      "loss": 1.0470497608184814,
+      "step": 938
+    },
+    {
+      "epoch": 0.40560949298813376,
+      "grad_norm": 0.6079246401786804,
+      "learning_rate": 0.00016873315363881403,
+      "loss": 0.8868283629417419,
+      "step": 940
+    },
+    {
+      "epoch": 0.4064724919093851,
+      "grad_norm": 0.5326972603797913,
+      "learning_rate": 0.00016909254267744833,
+      "loss": 0.9938711524009705,
+      "step": 942
+    },
+    {
+      "epoch": 0.40733549083063647,
+      "grad_norm": 0.47754305601119995,
+      "learning_rate": 0.00016945193171608266,
+      "loss": 0.8280484676361084,
+      "step": 944
+    },
+    {
+      "epoch": 0.4081984897518878,
+      "grad_norm": 0.6683310270309448,
+      "learning_rate": 0.000169811320754717,
+      "loss": 1.089701533317566,
+      "step": 946
+    },
+    {
+      "epoch": 0.4090614886731392,
+      "grad_norm": 0.42798754572868347,
+      "learning_rate": 0.0001701707097933513,
+      "loss": 0.8535542488098145,
+      "step": 948
+    },
+    {
+      "epoch": 0.4099244875943905,
+      "grad_norm": 0.5999574065208435,
+      "learning_rate": 0.00017053009883198563,
+      "loss": 0.9039298295974731,
+      "step": 950
+    },
+    {
+      "epoch": 0.41078748651564184,
+      "grad_norm": 0.5752781629562378,
+      "learning_rate": 0.00017088948787061996,
+      "loss": 0.8786448240280151,
+      "step": 952
+    },
+    {
+      "epoch": 0.4116504854368932,
+      "grad_norm": 0.5121532678604126,
+      "learning_rate": 0.0001712488769092543,
+      "loss": 0.9206072688102722,
+      "step": 954
+    },
+    {
+      "epoch": 0.41251348435814456,
+      "grad_norm": 0.611078143119812,
+      "learning_rate": 0.0001716082659478886,
+      "loss": 0.9246986508369446,
+      "step": 956
+    },
+    {
+      "epoch": 0.4133764832793959,
+      "grad_norm": 0.5101020336151123,
+      "learning_rate": 0.00017196765498652292,
+      "loss": 0.9221894145011902,
+      "step": 958
+    },
+    {
+      "epoch": 0.41423948220064727,
+      "grad_norm": 0.5681450963020325,
+      "learning_rate": 0.00017232704402515725,
+      "loss": 0.9072799682617188,
+      "step": 960
+    },
+    {
+      "epoch": 0.41510248112189857,
+      "grad_norm": 0.47865498065948486,
+      "learning_rate": 0.00017268643306379155,
+      "loss": 0.9460896849632263,
+      "step": 962
+    },
+    {
+      "epoch": 0.4159654800431499,
+      "grad_norm": 0.49861401319503784,
+      "learning_rate": 0.00017304582210242588,
+      "loss": 0.9121519923210144,
+      "step": 964
+    },
+    {
+      "epoch": 0.4168284789644013,
+      "grad_norm": 0.43025892972946167,
+      "learning_rate": 0.0001734052111410602,
+      "loss": 0.8826848864555359,
+      "step": 966
+    },
+    {
+      "epoch": 0.41769147788565264,
+      "grad_norm": 0.4600491225719452,
+      "learning_rate": 0.00017376460017969454,
+      "loss": 0.8756251335144043,
+      "step": 968
+    },
+    {
+      "epoch": 0.418554476806904,
+      "grad_norm": 0.5297656059265137,
+      "learning_rate": 0.00017412398921832885,
+      "loss": 0.9171333312988281,
+      "step": 970
+    },
+    {
+      "epoch": 0.41941747572815535,
+      "grad_norm": 0.4906919002532959,
+      "learning_rate": 0.00017448337825696318,
+      "loss": 0.8887524008750916,
+      "step": 972
+    },
+    {
+      "epoch": 0.4202804746494067,
+      "grad_norm": 0.49263402819633484,
+      "learning_rate": 0.0001748427672955975,
+      "loss": 0.8345810174942017,
+      "step": 974
+    },
+    {
+      "epoch": 0.421143473570658,
+      "grad_norm": 0.5706565380096436,
+      "learning_rate": 0.0001752021563342318,
+      "loss": 0.968651294708252,
+      "step": 976
+    },
+    {
+      "epoch": 0.42200647249190937,
+      "grad_norm": 0.5269908308982849,
+      "learning_rate": 0.00017556154537286614,
+      "loss": 0.9729376435279846,
+      "step": 978
+    },
+    {
+      "epoch": 0.4228694714131607,
+      "grad_norm": 0.47058001160621643,
+      "learning_rate": 0.00017592093441150047,
+      "loss": 0.963884711265564,
+      "step": 980
+    },
+    {
+      "epoch": 0.4237324703344121,
+      "grad_norm": 0.5322962999343872,
+      "learning_rate": 0.0001762803234501348,
+      "loss": 0.8952447175979614,
+      "step": 982
+    },
+    {
+      "epoch": 0.42459546925566344,
+      "grad_norm": 0.5750975012779236,
+      "learning_rate": 0.0001766397124887691,
+      "loss": 0.8932783603668213,
+      "step": 984
+    },
+    {
+      "epoch": 0.4254584681769148,
+      "grad_norm": 0.5539655685424805,
+      "learning_rate": 0.00017699910152740343,
+      "loss": 0.916595458984375,
+      "step": 986
+    },
+    {
+      "epoch": 0.42632146709816615,
+      "grad_norm": 0.568000853061676,
+      "learning_rate": 0.00017735849056603776,
+      "loss": 0.9669626355171204,
+      "step": 988
+    },
+    {
+      "epoch": 0.42718446601941745,
+      "grad_norm": 0.6010684370994568,
+      "learning_rate": 0.00017771787960467206,
+      "loss": 1.0089105367660522,
+      "step": 990
+    },
+    {
+      "epoch": 0.4280474649406688,
+      "grad_norm": 0.6083462238311768,
+      "learning_rate": 0.0001780772686433064,
+      "loss": 0.9810921549797058,
+      "step": 992
+    },
+    {
+      "epoch": 0.42891046386192017,
+      "grad_norm": 0.5076655149459839,
+      "learning_rate": 0.0001784366576819407,
+      "loss": 0.9524372816085815,
+      "step": 994
+    },
+    {
+      "epoch": 0.4297734627831715,
+      "grad_norm": 0.5260922312736511,
+      "learning_rate": 0.00017879604672057503,
+      "loss": 0.881294846534729,
+      "step": 996
+    },
+    {
+      "epoch": 0.4306364617044229,
+      "grad_norm": 0.6130498051643372,
+      "learning_rate": 0.00017915543575920936,
+      "loss": 0.9138327836990356,
+      "step": 998
+    },
+    {
+      "epoch": 0.43149946062567424,
+      "grad_norm": 0.5346242785453796,
+      "learning_rate": 0.00017951482479784366,
+      "loss": 0.8861367106437683,
+      "step": 1000
+    },
+    {
+      "epoch": 0.43149946062567424,
+      "eval_loss": 0.9606748819351196,
+      "eval_runtime": 655.4358,
+      "eval_samples_per_second": 3.143,
+      "eval_steps_per_second": 3.143,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 13908,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.001
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.98992434435586e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

sft_qwen_14B/checkpoints/checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4200830b23d19bc86049c280236c4a6b18c26d7061b5a57cc024888ec760920f
+size 5201

sft_qwen_14B/checkpoints/checkpoint-1500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/checkpoints/checkpoint-1500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/checkpoints/checkpoint-1500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d30694443628488e12d237aa3f10dc7201eb3cad1bd3f9502c86c23d3e61559d
+size 100715016

sft_qwen_14B/checkpoints/checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79236304328b73630e2b46bd5fd41cf80bc8c4ba971c80892b4ca8a2dd60ed90
+size 201650659

sft_qwen_14B/checkpoints/checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:062ed3c2bd7316800413ed3478ac76458e28f1148909e6811779e93c087c1f41
+size 14645

sft_qwen_14B/checkpoints/checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4170f3d34ca4a6959c04cb0d7d28946b661dbf270b5e56d1e31191350e1e65
+size 1465

sft_qwen_14B/checkpoints/checkpoint-1500/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

sft_qwen_14B/checkpoints/checkpoint-1500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4200830b23d19bc86049c280236c4a6b18c26d7061b5a57cc024888ec760920f
+size 5201

sft_qwen_14B/checkpoints/checkpoint-2000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/checkpoints/checkpoint-2000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/checkpoints/checkpoint-2000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac4fd06ae0a5072195431c638f9fa0553bde7cbdef4450af22ee4920688ac7c1
+size 100715016

sft_qwen_14B/checkpoints/checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1169f222c822dab1e5ce598a86b466b0ac52e8232e29a6e897e87612c208a024
+size 201650659

sft_qwen_14B/checkpoints/checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceabeff68a5532e9645fa847866094307abb118a32832ab82c4e32244ed9ef1a
+size 14645

sft_qwen_14B/checkpoints/checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37edb23690c4b97b9dc4a4a7b4f3bb29738198651fbb5fb02cf420e84396a51b
+size 1465

sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

sft_qwen_14B/checkpoints/checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4200830b23d19bc86049c280236c4a6b18c26d7061b5a57cc024888ec760920f
+size 5201

sft_qwen_14B/checkpoints/checkpoint-2500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/checkpoints/checkpoint-2500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/checkpoints/checkpoint-2500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be9580711ec346ace6d529831aa1885c08485e3841f57055b0bb9783a530ac14
+size 100715016

sft_qwen_14B/checkpoints/checkpoint-2500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f48f0a61449acbf6cb595fa231b8e2a5d5a08d8a1ce8639b35311b5b95adb22
+size 201650659

sft_qwen_14B/checkpoints/checkpoint-2500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62c6cf48875b5ffb30f6d449c0ad7793378c7b68f3e04492719377f35615e95c
+size 14645

sft_qwen_14B/checkpoints/checkpoint-2500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:891dcd76bbab61fc7167559c72d346bfecbc331339a7dc5622e34b2bde10b46a
+size 1465

sft_qwen_14B/checkpoints/checkpoint-2500/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

sft_qwen_14B/checkpoints/checkpoint-2500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4200830b23d19bc86049c280236c4a6b18c26d7061b5a57cc024888ec760920f
+size 5201

sft_qwen_14B/checkpoints/checkpoint-3000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/checkpoints/checkpoint-3000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/checkpoints/checkpoint-3000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7dcbf833b9436fe1633c8c06b79c62eb27315805c72c93a87cf400578cca73bb
+size 100715016

sft_qwen_14B/checkpoints/checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb6b77215c673d143314529902fe0e9c30346349f436c2b94e6287816a01931a
+size 201650659

sft_qwen_14B/checkpoints/checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8acb52ffaa9f33f79559f4745e1edd981724d4af3b215ee913bccc7a86ea9435
+size 14645

sft_qwen_14B/checkpoints/checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cbdba03d310bbbdcbbaef51ab8ce27fbdc8188e8f0659848963cbd94c6e7882
+size 1465

sft_qwen_14B/checkpoints/checkpoint-3000/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

sft_qwen_14B/checkpoints/checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4200830b23d19bc86049c280236c4a6b18c26d7061b5a57cc024888ec760920f
+size 5201

sft_qwen_14B/checkpoints/checkpoint-3500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: runs/cpt_run_14b/merged_14b_cpt_lora
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:runs/cpt_run_14b/merged_14b_cpt_lora
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.0

sft_qwen_14B/checkpoints/checkpoint-3500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "runs/cpt_run_14b/merged_14b_cpt_lora",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.0",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_qwen_14B/checkpoints/checkpoint-3500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e37c343dccf5c3a247692acad99797a99a1793a9cf387c8e79c5fda28b092489
+size 100715016

sft_qwen_14B/checkpoints/checkpoint-3500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab23160d343da52f73e074d13df36e94f9b1537165702ffffb28f4b3a03240be
+size 201650659

sft_qwen_14B/checkpoints/checkpoint-3500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c29f86833d3e2e183be7c52441966ecc0b80af7d1582cff4eabb3312f82d2029
+size 14645