jinghao57 committed on 22 days ago

Commit

e4155f1

verified ·

1 Parent(s): b5f123a

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +4 -0
checkpoint-2500/README.md +209 -0
checkpoint-2500/adapter_config.json +48 -0
checkpoint-2500/adapter_model.safetensors +3 -0
checkpoint-2500/added_tokens.json +24 -0
checkpoint-2500/chat_template.jinja +54 -0
checkpoint-2500/merges.txt +0 -0
checkpoint-2500/optimizer.pt +3 -0
checkpoint-2500/rng_state.pth +3 -0
checkpoint-2500/scheduler.pt +3 -0
checkpoint-2500/special_tokens_map.json +31 -0
checkpoint-2500/tokenizer.json +3 -0
checkpoint-2500/tokenizer_config.json +207 -0
checkpoint-2500/trainer_state.json +1824 -0
checkpoint-2500/training_args.bin +3 -0
checkpoint-2500/vocab.json +0 -0
checkpoint-3000/README.md +209 -0
checkpoint-3000/adapter_config.json +48 -0
checkpoint-3000/adapter_model.safetensors +3 -0
checkpoint-3000/added_tokens.json +24 -0
checkpoint-3000/chat_template.jinja +54 -0
checkpoint-3000/merges.txt +0 -0
checkpoint-3000/optimizer.pt +3 -0
checkpoint-3000/rng_state.pth +3 -0
checkpoint-3000/scheduler.pt +3 -0
checkpoint-3000/special_tokens_map.json +31 -0
checkpoint-3000/tokenizer.json +3 -0
checkpoint-3000/tokenizer_config.json +207 -0
checkpoint-3000/trainer_state.json +2182 -0
checkpoint-3000/training_args.bin +3 -0
checkpoint-3000/vocab.json +0 -0
checkpoint-3116/README.md +209 -0
checkpoint-3116/adapter_config.json +48 -0
checkpoint-3116/adapter_model.safetensors +3 -0
checkpoint-3116/added_tokens.json +24 -0
checkpoint-3116/chat_template.jinja +54 -0
checkpoint-3116/merges.txt +0 -0
checkpoint-3116/optimizer.pt +3 -0
checkpoint-3116/rng_state.pth +3 -0
checkpoint-3116/scheduler.pt +3 -0
checkpoint-3116/special_tokens_map.json +31 -0
checkpoint-3116/tokenizer.json +3 -0
checkpoint-3116/tokenizer_config.json +207 -0
checkpoint-3116/trainer_state.json +2259 -0
checkpoint-3116/training_args.bin +3 -0
checkpoint-3116/vocab.json +0 -0
final-adapter/README.md +209 -0
final-adapter/adapter_config.json +48 -0
final-adapter/adapter_model.safetensors +3 -0
final-adapter/added_tokens.json +24 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3116/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+final-adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoint-2500/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Coder-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Coder-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoint-2500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-2500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:949c1d6edc1607a91eef4b7f27590a11dda5846556d363f2417aa79a2321435b
+size 645975704

checkpoint-2500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-2500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{#- Chat template: renders `messages` (and optional `tools`) as a sequence of <|im_start|>role ... <|im_end|> turns. #}
+{#- NOTE(review): messages[0] is read unconditionally below; presumably callers always pass at least one message - confirm. #}
+{#- Part 1: system block. When tools are supplied, the system prompt is extended with the tool signatures and the tool-call format. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {#- Use the caller's system message when it is the first message; otherwise fall back to the default prompt. #}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool definition per line inside the <tools> element. #}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {#- Closes <tools>, describes the expected <tool_call> JSON shape, and ends the system turn. #}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {#- No tools: emit a plain system turn, again preferring the caller's first message over the default prompt. #}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Part 2: conversation turns. #}
+{%- for message in messages %}
+    {#- Plain turns: user, assistant without tool calls, and any system message other than the first loop item (a leading system message was already rendered in part 1). #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Assistant turn carrying tool calls: optional text content first, then one <tool_call> element per call. #}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {#- Accept both a nested `function` object and a flat object exposing name/arguments directly. #}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Tool results are rendered inside a user turn; a run of consecutive tool messages shares one turn. #}
+        {#- Open the user turn only on the first tool message of a run. #}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {#- Close the user turn only after the last tool message of a run. #}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Part 3: optionally open an assistant turn so generation continues from there. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-2500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03506de5040451c4109e8750d651f3f8e3d6efcd2a0199660dfb943ffa940a5e
+size 1292176234

checkpoint-2500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70ef42781c9db9bf4ed690495b7f5fa2abebe765ea384e6fcd4377affbe1199d
+size 14244

checkpoint-2500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f0a8594ada369c1380e2dc14e138837016aa89c29e5f6621f284d2ed284c72c
+size 1064

checkpoint-2500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-2500/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83396048d512ec1f3178af0d7c1f79a226bba041822614b0e26a4fd2d4b55bf7
+size 11421995

checkpoint-2500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-2500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1824 @@

+{
+  "best_global_step": 2500,
+  "best_metric": 0.04088287055492401,
+  "best_model_checkpoint": "./checkpoints/checkpoint-2500",
+  "epoch": 1.6046940822467404,
+  "eval_steps": 500,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064192577733199595,
+      "grad_norm": 0.734896719455719,
+      "learning_rate": 1.9148936170212766e-05,
+      "loss": 1.2808,
+      "step": 10
+    },
+    {
+      "epoch": 0.012838515546639919,
+      "grad_norm": 0.5790244340896606,
+      "learning_rate": 4.0425531914893614e-05,
+      "loss": 1.065,
+      "step": 20
+    },
+    {
+      "epoch": 0.01925777331995988,
+      "grad_norm": 0.20778240263462067,
+      "learning_rate": 6.170212765957447e-05,
+      "loss": 0.6486,
+      "step": 30
+    },
+    {
+      "epoch": 0.025677031093279838,
+      "grad_norm": 0.17519450187683105,
+      "learning_rate": 8.297872340425533e-05,
+      "loss": 0.559,
+      "step": 40
+    },
+    {
+      "epoch": 0.0320962888665998,
+      "grad_norm": 0.2367718517780304,
+      "learning_rate": 0.00010425531914893618,
+      "loss": 0.4906,
+      "step": 50
+    },
+    {
+      "epoch": 0.03851554663991976,
+      "grad_norm": 0.24615059792995453,
+      "learning_rate": 0.00012553191489361702,
+      "loss": 0.4413,
+      "step": 60
+    },
+    {
+      "epoch": 0.04493480441323972,
+      "grad_norm": 0.22158223390579224,
+      "learning_rate": 0.00014680851063829788,
+      "loss": 0.386,
+      "step": 70
+    },
+    {
+      "epoch": 0.051354062186559676,
+      "grad_norm": 0.18299002945423126,
+      "learning_rate": 0.00016808510638297873,
+      "loss": 0.37,
+      "step": 80
+    },
+    {
+      "epoch": 0.05777331995987964,
+      "grad_norm": 0.483330100774765,
+      "learning_rate": 0.00018936170212765957,
+      "loss": 0.408,
+      "step": 90
+    },
+    {
+      "epoch": 0.0641925777331996,
+      "grad_norm": 0.36728036403656006,
+      "learning_rate": 0.00019999864911039267,
+      "loss": 0.3986,
+      "step": 100
+    },
+    {
+      "epoch": 0.07061183550651956,
+      "grad_norm": 0.23549625277519226,
+      "learning_rate": 0.00019998784221252132,
+      "loss": 0.3367,
+      "step": 110
+    },
+    {
+      "epoch": 0.07703109327983952,
+      "grad_norm": 0.2653771936893463,
+      "learning_rate": 0.0001999662295846848,
+      "loss": 0.3457,
+      "step": 120
+    },
+    {
+      "epoch": 0.08345035105315948,
+      "grad_norm": 0.21738414466381073,
+      "learning_rate": 0.0001999338135625693,
+      "loss": 0.3075,
+      "step": 130
+    },
+    {
+      "epoch": 0.08986960882647944,
+      "grad_norm": 0.2068450152873993,
+      "learning_rate": 0.00019989059764938857,
+      "loss": 0.3423,
+      "step": 140
+    },
+    {
+      "epoch": 0.09628886659979939,
+      "grad_norm": 0.24901063740253448,
+      "learning_rate": 0.00019983658651550522,
+      "loss": 0.2767,
+      "step": 150
+    },
+    {
+      "epoch": 0.10270812437311935,
+      "grad_norm": 0.34664201736450195,
+      "learning_rate": 0.00019977178599792623,
+      "loss": 0.297,
+      "step": 160
+    },
+    {
+      "epoch": 0.10912738214643931,
+      "grad_norm": 0.29314759373664856,
+      "learning_rate": 0.00019969620309967198,
+      "loss": 0.2975,
+      "step": 170
+    },
+    {
+      "epoch": 0.11554663991975928,
+      "grad_norm": 0.3220290541648865,
+      "learning_rate": 0.0001996098459890194,
+      "loss": 0.2717,
+      "step": 180
+    },
+    {
+      "epoch": 0.12196589769307924,
+      "grad_norm": 0.26825714111328125,
+      "learning_rate": 0.00019951272399861938,
+      "loss": 0.262,
+      "step": 190
+    },
+    {
+      "epoch": 0.1283851554663992,
+      "grad_norm": 0.2678840458393097,
+      "learning_rate": 0.00019940484762448794,
+      "loss": 0.2439,
+      "step": 200
+    },
+    {
+      "epoch": 0.13480441323971915,
+      "grad_norm": 0.26287031173706055,
+      "learning_rate": 0.00019928622852487216,
+      "loss": 0.2463,
+      "step": 210
+    },
+    {
+      "epoch": 0.14122367101303912,
+      "grad_norm": 0.2625996172428131,
+      "learning_rate": 0.00019915687951899025,
+      "loss": 0.2158,
+      "step": 220
+    },
+    {
+      "epoch": 0.14764292878635907,
+      "grad_norm": 0.21532030403614044,
+      "learning_rate": 0.00019901681458564592,
+      "loss": 0.2505,
+      "step": 230
+    },
+    {
+      "epoch": 0.15406218655967904,
+      "grad_norm": 0.18523766100406647,
+      "learning_rate": 0.00019886604886171797,
+      "loss": 0.2648,
+      "step": 240
+    },
+    {
+      "epoch": 0.160481444332999,
+      "grad_norm": 0.1750323474407196,
+      "learning_rate": 0.00019870459864052435,
+      "loss": 0.1999,
+      "step": 250
+    },
+    {
+      "epoch": 0.16690070210631897,
+      "grad_norm": 0.21571072936058044,
+      "learning_rate": 0.00019853248137006123,
+      "loss": 0.2513,
+      "step": 260
+    },
+    {
+      "epoch": 0.1733199598796389,
+      "grad_norm": 0.4137316942214966,
+      "learning_rate": 0.00019834971565111758,
+      "loss": 0.2365,
+      "step": 270
+    },
+    {
+      "epoch": 0.1797392176529589,
+      "grad_norm": 0.1867353618144989,
+      "learning_rate": 0.0001981563212352648,
+      "loss": 0.2136,
+      "step": 280
+    },
+    {
+      "epoch": 0.18615847542627884,
+      "grad_norm": 0.24344463646411896,
+      "learning_rate": 0.0001979523190227222,
+      "loss": 0.2051,
+      "step": 290
+    },
+    {
+      "epoch": 0.19257773319959878,
+      "grad_norm": 0.32625189423561096,
+      "learning_rate": 0.0001977377310600984,
+      "loss": 0.2142,
+      "step": 300
+    },
+    {
+      "epoch": 0.19899699097291876,
+      "grad_norm": 0.2645917236804962,
+      "learning_rate": 0.00019751258053800865,
+      "loss": 0.2169,
+      "step": 310
+    },
+    {
+      "epoch": 0.2054162487462387,
+      "grad_norm": 0.2912643849849701,
+      "learning_rate": 0.0001972768917885686,
+      "loss": 0.2276,
+      "step": 320
+    },
+    {
+      "epoch": 0.21183550651955868,
+      "grad_norm": 0.18374884128570557,
+      "learning_rate": 0.00019703069028276482,
+      "loss": 0.181,
+      "step": 330
+    },
+    {
+      "epoch": 0.21825476429287863,
+      "grad_norm": 0.22864000499248505,
+      "learning_rate": 0.000196774002627702,
+      "loss": 0.2036,
+      "step": 340
+    },
+    {
+      "epoch": 0.2246740220661986,
+      "grad_norm": 0.1839989423751831,
+      "learning_rate": 0.00019650685656372763,
+      "loss": 0.1646,
+      "step": 350
+    },
+    {
+      "epoch": 0.23109327983951855,
+      "grad_norm": 0.2861062288284302,
+      "learning_rate": 0.000196229280961434,
+      "loss": 0.1854,
+      "step": 360
+    },
+    {
+      "epoch": 0.23751253761283853,
+      "grad_norm": 0.19540704786777496,
+      "learning_rate": 0.00019594130581853823,
+      "loss": 0.1444,
+      "step": 370
+    },
+    {
+      "epoch": 0.24393179538615847,
+      "grad_norm": 0.33926671743392944,
+      "learning_rate": 0.0001956429622566403,
+      "loss": 0.1933,
+      "step": 380
+    },
+    {
+      "epoch": 0.2503510531594784,
+      "grad_norm": 0.3747280538082123,
+      "learning_rate": 0.00019533428251785983,
+      "loss": 0.1932,
+      "step": 390
+    },
+    {
+      "epoch": 0.2567703109327984,
+      "grad_norm": 0.1787070333957672,
+      "learning_rate": 0.00019501529996135156,
+      "loss": 0.1855,
+      "step": 400
+    },
+    {
+      "epoch": 0.26318956870611837,
+      "grad_norm": 0.21175768971443176,
+      "learning_rate": 0.00019468604905970033,
+      "loss": 0.2053,
+      "step": 410
+    },
+    {
+      "epoch": 0.2696088264794383,
+      "grad_norm": 0.22584153711795807,
+      "learning_rate": 0.00019434656539519548,
+      "loss": 0.1551,
+      "step": 420
+    },
+    {
+      "epoch": 0.27602808425275827,
+      "grad_norm": 0.20716825127601624,
+      "learning_rate": 0.00019399688565598547,
+      "loss": 0.1708,
+      "step": 430
+    },
+    {
+      "epoch": 0.28244734202607824,
+      "grad_norm": 0.21005718410015106,
+      "learning_rate": 0.0001936370476321132,
+      "loss": 0.1279,
+      "step": 440
+    },
+    {
+      "epoch": 0.2888665997993982,
+      "grad_norm": 0.209135502576828,
+      "learning_rate": 0.00019326709021143167,
+      "loss": 0.1584,
+      "step": 450
+    },
+    {
+      "epoch": 0.29528585757271814,
+      "grad_norm": 0.2232855260372162,
+      "learning_rate": 0.00019288705337540166,
+      "loss": 0.1562,
+      "step": 460
+    },
+    {
+      "epoch": 0.3017051153460381,
+      "grad_norm": 0.2212464064359665,
+      "learning_rate": 0.0001924969781947707,
+      "loss": 0.1733,
+      "step": 470
+    },
+    {
+      "epoch": 0.3081243731193581,
+      "grad_norm": 0.18040229380130768,
+      "learning_rate": 0.00019209690682513465,
+      "loss": 0.12,
+      "step": 480
+    },
+    {
+      "epoch": 0.31454363089267806,
+      "grad_norm": 0.16415567696094513,
+      "learning_rate": 0.0001916868825023819,
+      "loss": 0.1257,
+      "step": 490
+    },
+    {
+      "epoch": 0.320962888665998,
+      "grad_norm": 0.18324217200279236,
+      "learning_rate": 0.00019126694953802093,
+      "loss": 0.138,
+      "step": 500
+    },
+    {
+      "epoch": 0.320962888665998,
+      "eval_loss": 0.15367139875888824,
+      "eval_runtime": 127.7222,
+      "eval_samples_per_second": 10.296,
+      "eval_steps_per_second": 10.296,
+      "step": 500
+    },
+    {
+      "epoch": 0.32738214643931796,
+      "grad_norm": 0.15339615941047668,
+      "learning_rate": 0.00019083715331439134,
+      "loss": 0.1653,
+      "step": 510
+    },
+    {
+      "epoch": 0.33380140421263793,
+      "grad_norm": 0.2750616669654846,
+      "learning_rate": 0.00019039754027975952,
+      "loss": 0.1834,
+      "step": 520
+    },
+    {
+      "epoch": 0.34022066198595785,
+      "grad_norm": 0.20012134313583374,
+      "learning_rate": 0.00018994815794329896,
+      "loss": 0.1278,
+      "step": 530
+    },
+    {
+      "epoch": 0.3466399197592778,
+      "grad_norm": 0.24493156373500824,
+      "learning_rate": 0.0001894890548699559,
+      "loss": 0.142,
+      "step": 540
+    },
+    {
+      "epoch": 0.3530591775325978,
+      "grad_norm": 0.1628945767879486,
+      "learning_rate": 0.0001890202806752008,
+      "loss": 0.1287,
+      "step": 550
+    },
+    {
+      "epoch": 0.3594784353059178,
+      "grad_norm": 0.24853569269180298,
+      "learning_rate": 0.00018854188601966657,
+      "loss": 0.1397,
+      "step": 560
+    },
+    {
+      "epoch": 0.3658976930792377,
+      "grad_norm": 0.1540278196334839,
+      "learning_rate": 0.0001880539226036734,
+      "loss": 0.1343,
+      "step": 570
+    },
+    {
+      "epoch": 0.37231695085255767,
+      "grad_norm": 0.3391481935977936,
+      "learning_rate": 0.00018755644316164176,
+      "loss": 0.122,
+      "step": 580
+    },
+    {
+      "epoch": 0.37873620862587765,
+      "grad_norm": 0.19285668432712555,
+      "learning_rate": 0.0001870495014563931,
+      "loss": 0.1015,
+      "step": 590
+    },
+    {
+      "epoch": 0.38515546639919757,
+      "grad_norm": 0.19064725935459137,
+      "learning_rate": 0.00018653315227333992,
+      "loss": 0.1165,
+      "step": 600
+    },
+    {
+      "epoch": 0.39157472417251754,
+      "grad_norm": 0.21822167932987213,
+      "learning_rate": 0.00018600745141456485,
+      "loss": 0.1185,
+      "step": 610
+    },
+    {
+      "epoch": 0.3979939819458375,
+      "grad_norm": 0.21495632827281952,
+      "learning_rate": 0.0001854724556927903,
+      "loss": 0.1455,
+      "step": 620
+    },
+    {
+      "epoch": 0.4044132397191575,
+      "grad_norm": 0.21818260848522186,
+      "learning_rate": 0.00018492822292523863,
+      "loss": 0.1045,
+      "step": 630
+    },
+    {
+      "epoch": 0.4108324974924774,
+      "grad_norm": 0.2295123040676117,
+      "learning_rate": 0.0001843748119273837,
+      "loss": 0.1331,
+      "step": 640
+    },
+    {
+      "epoch": 0.4172517552657974,
+      "grad_norm": 0.3051343560218811,
+      "learning_rate": 0.0001838122825065948,
+      "loss": 0.1139,
+      "step": 650
+    },
+    {
+      "epoch": 0.42367101303911736,
+      "grad_norm": 0.24361808598041534,
+      "learning_rate": 0.0001832406954556732,
+      "loss": 0.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.43009027081243734,
+      "grad_norm": 0.19327011704444885,
+      "learning_rate": 0.00018266011254628218,
+      "loss": 0.087,
+      "step": 670
+    },
+    {
+      "epoch": 0.43650952858575726,
+      "grad_norm": 0.21391886472702026,
+      "learning_rate": 0.0001820705965222714,
+      "loss": 0.1079,
+      "step": 680
+    },
+    {
+      "epoch": 0.44292878635907723,
+      "grad_norm": 0.21966692805290222,
+      "learning_rate": 0.0001814722110928962,
+      "loss": 0.1188,
+      "step": 690
+    },
+    {
+      "epoch": 0.4493480441323972,
+      "grad_norm": 0.19202131032943726,
+      "learning_rate": 0.00018086502092593234,
+      "loss": 0.0991,
+      "step": 700
+    },
+    {
+      "epoch": 0.4557673019057171,
+      "grad_norm": 0.1604185700416565,
+      "learning_rate": 0.00018024909164068755,
+      "loss": 0.094,
+      "step": 710
+    },
+    {
+      "epoch": 0.4621865596790371,
+      "grad_norm": 0.21912045776844025,
+      "learning_rate": 0.00017962448980090982,
+      "loss": 0.1093,
+      "step": 720
+    },
+    {
+      "epoch": 0.4686058174523571,
+      "grad_norm": 0.16105616092681885,
+      "learning_rate": 0.00017899128290759395,
+      "loss": 0.0747,
+      "step": 730
+    },
+    {
+      "epoch": 0.47502507522567705,
+      "grad_norm": 0.19927042722702026,
+      "learning_rate": 0.00017834953939168663,
+      "loss": 0.1057,
+      "step": 740
+    },
+    {
+      "epoch": 0.48144433299899697,
+      "grad_norm": 0.2838548421859741,
+      "learning_rate": 0.00017769932860669111,
+      "loss": 0.0891,
+      "step": 750
+    },
+    {
+      "epoch": 0.48786359077231695,
+      "grad_norm": 0.16990593075752258,
+      "learning_rate": 0.00017704072082117215,
+      "loss": 0.0993,
+      "step": 760
+    },
+    {
+      "epoch": 0.4942828485456369,
+      "grad_norm": 0.22423957288265228,
+      "learning_rate": 0.00017637378721116197,
+      "loss": 0.0787,
+      "step": 770
+    },
+    {
+      "epoch": 0.5007021063189568,
+      "grad_norm": 0.224956676363945,
+      "learning_rate": 0.00017569859985246838,
+      "loss": 0.1019,
+      "step": 780
+    },
+    {
+      "epoch": 0.5071213640922768,
+      "grad_norm": 0.11727194488048553,
+      "learning_rate": 0.0001750152317128854,
+      "loss": 0.0806,
+      "step": 790
+    },
+    {
+      "epoch": 0.5135406218655968,
+      "grad_norm": 0.13485625386238098,
+      "learning_rate": 0.00017432375664430753,
+      "loss": 0.0873,
+      "step": 800
+    },
+    {
+      "epoch": 0.5199598796389168,
+      "grad_norm": 0.2887745201587677,
+      "learning_rate": 0.0001736242493747488,
+      "loss": 0.0796,
+      "step": 810
+    },
+    {
+      "epoch": 0.5263791374122367,
+      "grad_norm": 0.3110921084880829,
+      "learning_rate": 0.00017291678550026665,
+      "loss": 0.0912,
+      "step": 820
+    },
+    {
+      "epoch": 0.5327983951855567,
+      "grad_norm": 0.17630113661289215,
+      "learning_rate": 0.0001722014414767923,
+      "loss": 0.0795,
+      "step": 830
+    },
+    {
+      "epoch": 0.5392176529588766,
+      "grad_norm": 0.1666630357503891,
+      "learning_rate": 0.00017147829461186816,
+      "loss": 0.0791,
+      "step": 840
+    },
+    {
+      "epoch": 0.5456369107321966,
+      "grad_norm": 0.17798000574111938,
+      "learning_rate": 0.00017074742305629305,
+      "loss": 0.1046,
+      "step": 850
+    },
+    {
+      "epoch": 0.5520561685055165,
+      "grad_norm": 0.30929967761039734,
+      "learning_rate": 0.0001700089057956766,
+      "loss": 0.088,
+      "step": 860
+    },
+    {
+      "epoch": 0.5584754262788365,
+      "grad_norm": 0.12197452783584595,
+      "learning_rate": 0.00016926282264190313,
+      "loss": 0.0766,
+      "step": 870
+    },
+    {
+      "epoch": 0.5648946840521565,
+      "grad_norm": 0.254573255777359,
+      "learning_rate": 0.00016850925422450626,
+      "loss": 0.1153,
+      "step": 880
+    },
+    {
+      "epoch": 0.5713139418254765,
+      "grad_norm": 0.14357267320156097,
+      "learning_rate": 0.0001677482819819554,
+      "loss": 0.0919,
+      "step": 890
+    },
+    {
+      "epoch": 0.5777331995987964,
+      "grad_norm": 0.22417870163917542,
+      "learning_rate": 0.00016697998815285461,
+      "loss": 0.0771,
+      "step": 900
+    },
+    {
+      "epoch": 0.5841524573721163,
+      "grad_norm": 0.37406066060066223,
+      "learning_rate": 0.00016620445576705502,
+      "loss": 0.0783,
+      "step": 910
+    },
+    {
+      "epoch": 0.5905717151454363,
+      "grad_norm": 0.1301262080669403,
+      "learning_rate": 0.0001654217686366817,
+      "loss": 0.064,
+      "step": 920
+    },
+    {
+      "epoch": 0.5969909729187562,
+      "grad_norm": 0.13001087307929993,
+      "learning_rate": 0.0001646320113470761,
+      "loss": 0.0554,
+      "step": 930
+    },
+    {
+      "epoch": 0.6034102306920762,
+      "grad_norm": 0.1982804834842682,
+      "learning_rate": 0.00016383526924765494,
+      "loss": 0.0902,
+      "step": 940
+    },
+    {
+      "epoch": 0.6098294884653962,
+      "grad_norm": 0.14553162455558777,
+      "learning_rate": 0.0001630316284426864,
+      "loss": 0.073,
+      "step": 950
+    },
+    {
+      "epoch": 0.6162487462387162,
+      "grad_norm": 0.22459714114665985,
+      "learning_rate": 0.00016222117578198477,
+      "loss": 0.0874,
+      "step": 960
+    },
+    {
+      "epoch": 0.6226680040120361,
+      "grad_norm": 0.23248261213302612,
+      "learning_rate": 0.00016140399885152456,
+      "loss": 0.0607,
+      "step": 970
+    },
+    {
+      "epoch": 0.6290872617853561,
+      "grad_norm": 0.28904321789741516,
+      "learning_rate": 0.00016058018596397508,
+      "loss": 0.0708,
+      "step": 980
+    },
+    {
+      "epoch": 0.635506519558676,
+      "grad_norm": 0.18531730771064758,
+      "learning_rate": 0.00015974982614915643,
+      "loss": 0.064,
+      "step": 990
+    },
+    {
+      "epoch": 0.641925777331996,
+      "grad_norm": 0.16869930922985077,
+      "learning_rate": 0.00015891300914441803,
+      "loss": 0.0526,
+      "step": 1000
+    },
+    {
+      "epoch": 0.641925777331996,
+      "eval_loss": 0.08326917141675949,
+      "eval_runtime": 128.9614,
+      "eval_samples_per_second": 10.197,
+      "eval_steps_per_second": 10.197,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6483450351053159,
+      "grad_norm": 0.23396554589271545,
+      "learning_rate": 0.00015806982538494065,
+      "loss": 0.0624,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6547642928786359,
+      "grad_norm": 0.13306337594985962,
+      "learning_rate": 0.00015722036599396296,
+      "loss": 0.084,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6611835506519559,
+      "grad_norm": 0.14258597791194916,
+      "learning_rate": 0.00015636472277293396,
+      "loss": 0.0553,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6676028084252759,
+      "grad_norm": 0.17658811807632446,
+      "learning_rate": 0.00015550298819159189,
+      "loss": 0.0641,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6740220661985958,
+      "grad_norm": 0.22399111092090607,
+      "learning_rate": 0.0001546352553779709,
+      "loss": 0.0798,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6804413239719157,
+      "grad_norm": 0.25611406564712524,
+      "learning_rate": 0.0001537616181083368,
+      "loss": 0.062,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6868605817452357,
+      "grad_norm": 0.21297794580459595,
+      "learning_rate": 0.00015288217079705246,
+      "loss": 0.0557,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6932798395185557,
+      "grad_norm": 0.1601909101009369,
+      "learning_rate": 0.00015199700848637462,
+      "loss": 0.0838,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6996990972918756,
+      "grad_norm": 0.19593413174152374,
+      "learning_rate": 0.00015110622683618243,
+      "loss": 0.0578,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7061183550651956,
+      "grad_norm": 0.3398507237434387,
+      "learning_rate": 0.0001502099221136395,
+      "loss": 0.0738,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7125376128385156,
+      "grad_norm": 0.23959602415561676,
+      "learning_rate": 0.0001493081911827904,
+      "loss": 0.0562,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7189568706118356,
+      "grad_norm": 0.22912147641181946,
+      "learning_rate": 0.00014840113149409233,
+      "loss": 0.0708,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7253761283851554,
+      "grad_norm": 0.1650708019733429,
+      "learning_rate": 0.00014748884107388372,
+      "loss": 0.0552,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7317953861584754,
+      "grad_norm": 0.19332996010780334,
+      "learning_rate": 0.00014657141851379043,
+      "loss": 0.0473,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7382146439317954,
+      "grad_norm": 0.15983298420906067,
+      "learning_rate": 0.00014564896296007088,
+      "loss": 0.0465,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7446339017051153,
+      "grad_norm": 0.16740870475769043,
+      "learning_rate": 0.00014472157410290147,
+      "loss": 0.0482,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7510531594784353,
+      "grad_norm": 0.17517174780368805,
+      "learning_rate": 0.00014378935216560268,
+      "loss": 0.0678,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7574724172517553,
+      "grad_norm": 0.2131085991859436,
+      "learning_rate": 0.00014285239789380824,
+      "loss": 0.0436,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7638916750250753,
+      "grad_norm": 0.17330893874168396,
+      "learning_rate": 0.00014191081254457725,
+      "loss": 0.0504,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7703109327983951,
+      "grad_norm": 0.15023717284202576,
+      "learning_rate": 0.0001409646978754514,
+      "loss": 0.0531,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7767301905717151,
+      "grad_norm": 0.10652194917201996,
+      "learning_rate": 0.00014001415613345793,
+      "loss": 0.0468,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7831494483450351,
+      "grad_norm": 0.11492206156253815,
+      "learning_rate": 0.00013905929004405992,
+      "loss": 0.0497,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7895687061183551,
+      "grad_norm": 0.27618470788002014,
+      "learning_rate": 0.00013810020280005441,
+      "loss": 0.0696,
+      "step": 1230
+    },
+    {
+      "epoch": 0.795987963891675,
+      "grad_norm": 0.10219205170869827,
+      "learning_rate": 0.00013713699805042057,
+      "loss": 0.0514,
+      "step": 1240
+    },
+    {
+      "epoch": 0.802407221664995,
+      "grad_norm": 0.1219983845949173,
+      "learning_rate": 0.00013616977988911821,
+      "loss": 0.0378,
+      "step": 1250
+    },
+    {
+      "epoch": 0.808826479438315,
+      "grad_norm": 0.1977282017469406,
+      "learning_rate": 0.00013519865284383818,
+      "loss": 0.0639,
+      "step": 1260
+    },
+    {
+      "epoch": 0.815245737211635,
+      "grad_norm": 0.11138994246721268,
+      "learning_rate": 0.00013422372186470632,
+      "loss": 0.0575,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8216649949849548,
+      "grad_norm": 0.1591784507036209,
+      "learning_rate": 0.00013324509231294108,
+      "loss": 0.0399,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8280842527582748,
+      "grad_norm": 0.12934090197086334,
+      "learning_rate": 0.00013226286994946746,
+      "loss": 0.0469,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8345035105315948,
+      "grad_norm": 0.13661682605743408,
+      "learning_rate": 0.00013127716092348708,
+      "loss": 0.0441,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8409227683049147,
+      "grad_norm": 0.25683388113975525,
+      "learning_rate": 0.0001302880717610067,
+      "loss": 0.0644,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8473420260782347,
+      "grad_norm": 0.14545147120952606,
+      "learning_rate": 0.00012929570935332597,
+      "loss": 0.0446,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8537612838515547,
+      "grad_norm": 0.16420190036296844,
+      "learning_rate": 0.0001283001809454856,
+      "loss": 0.0597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8601805416248747,
+      "grad_norm": 0.2332782745361328,
+      "learning_rate": 0.00012730159412467726,
+      "loss": 0.0435,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8665997993981945,
+      "grad_norm": 0.09255204349756241,
+      "learning_rate": 0.00012630005680861668,
+      "loss": 0.0528,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8730190571715145,
+      "grad_norm": 0.2076631337404251,
+      "learning_rate": 0.00012529567723388088,
+      "loss": 0.0525,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8794383149448345,
+      "grad_norm": 0.17183755338191986,
+      "learning_rate": 0.0001242885639442111,
+      "loss": 0.0435,
+      "step": 1370
+    },
+    {
+      "epoch": 0.8858575727181545,
+      "grad_norm": 0.1666991412639618,
+      "learning_rate": 0.00012327882577878226,
+      "loss": 0.0403,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8922768304914744,
+      "grad_norm": 0.1412462741136551,
+      "learning_rate": 0.00012226657186044086,
+      "loss": 0.0512,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8986960882647944,
+      "grad_norm": 0.13901075720787048,
+      "learning_rate": 0.00012125191158391178,
+      "loss": 0.0311,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9051153460381144,
+      "grad_norm": 0.1401802897453308,
+      "learning_rate": 0.00012023495460397614,
+      "loss": 0.0383,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9115346038114343,
+      "grad_norm": 0.18335814774036407,
+      "learning_rate": 0.00011921581082362092,
+      "loss": 0.0355,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9179538615847542,
+      "grad_norm": 0.1105201244354248,
+      "learning_rate": 0.00011819459038216143,
+      "loss": 0.0433,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9243731193580742,
+      "grad_norm": 0.10848142206668854,
+      "learning_rate": 0.00011717140364333874,
+      "loss": 0.0333,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9307923771313942,
+      "grad_norm": 0.07852191478013992,
+      "learning_rate": 0.00011614636118339249,
+      "loss": 0.0466,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9372116349047142,
+      "grad_norm": 0.16577693819999695,
+      "learning_rate": 0.00011511957377911091,
+      "loss": 0.0533,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9436308926780341,
+      "grad_norm": 0.12513399124145508,
+      "learning_rate": 0.00011409115239585921,
+      "loss": 0.0371,
+      "step": 1470
+    },
+    {
+      "epoch": 0.9500501504513541,
+      "grad_norm": 0.24522411823272705,
+      "learning_rate": 0.00011306120817558736,
+      "loss": 0.0463,
+      "step": 1480
+    },
+    {
+      "epoch": 0.956469408224674,
+      "grad_norm": 0.1543067842721939,
+      "learning_rate": 0.00011202985242481898,
+      "loss": 0.0411,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9628886659979939,
+      "grad_norm": 0.10811112076044083,
+      "learning_rate": 0.00011099719660262243,
+      "loss": 0.043,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9628886659979939,
+      "eval_loss": 0.057985104620456696,
+      "eval_runtime": 128.7458,
+      "eval_samples_per_second": 10.214,
+      "eval_steps_per_second": 10.214,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9693079237713139,
+      "grad_norm": 0.10716786235570908,
+      "learning_rate": 0.00010996335230856538,
+      "loss": 0.034,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9757271815446339,
+      "grad_norm": 0.12667109072208405,
+      "learning_rate": 0.00010892843127065416,
+      "loss": 0.0382,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9821464393179539,
+      "grad_norm": 0.22383131086826324,
+      "learning_rate": 0.00010789254533325929,
+      "loss": 0.0491,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9885656970912738,
+      "grad_norm": 0.09304796904325485,
+      "learning_rate": 0.00010685580644502837,
+      "loss": 0.0354,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9949849548645938,
+      "grad_norm": 0.07743366062641144,
+      "learning_rate": 0.0001058183266467878,
+      "loss": 0.042,
+      "step": 1550
+    },
+    {
+      "epoch": 1.001283851554664,
+      "grad_norm": 0.15325789153575897,
+      "learning_rate": 0.00010478021805943445,
+      "loss": 0.039,
+      "step": 1560
+    },
+    {
+      "epoch": 1.007703109327984,
+      "grad_norm": 0.07142600417137146,
+      "learning_rate": 0.00010374159287181868,
+      "loss": 0.0299,
+      "step": 1570
+    },
+    {
+      "epoch": 1.014122367101304,
+      "grad_norm": 0.07669597119092941,
+      "learning_rate": 0.00010270256332862014,
+      "loss": 0.0301,
+      "step": 1580
+    },
+    {
+      "epoch": 1.020541624874624,
+      "grad_norm": 0.07297486811876297,
+      "learning_rate": 0.00010166324171821721,
+      "loss": 0.0347,
+      "step": 1590
+    },
+    {
+      "epoch": 1.026960882647944,
+      "grad_norm": 0.20420107245445251,
+      "learning_rate": 0.00010062374036055226,
+      "loss": 0.0307,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0333801404212637,
+      "grad_norm": 0.09947313368320465,
+      "learning_rate": 9.958417159499298e-05,
+      "loss": 0.0307,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0397993981945837,
+      "grad_norm": 0.11809804290533066,
+      "learning_rate": 9.85446477681918e-05,
+      "loss": 0.0299,
+      "step": 1620
+    },
+    {
+      "epoch": 1.0462186559679036,
+      "grad_norm": 0.2631526291370392,
+      "learning_rate": 9.750528122194467e-05,
+      "loss": 0.0387,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0526379137412236,
+      "grad_norm": 0.08276328444480896,
+      "learning_rate": 9.646618428105013e-05,
+      "loss": 0.0305,
+      "step": 1640
+    },
+    {
+      "epoch": 1.0590571715145436,
+      "grad_norm": 0.11126093566417694,
+      "learning_rate": 9.542746924117037e-05,
+      "loss": 0.0315,
+      "step": 1650
+    },
+    {
+      "epoch": 1.0654764292878636,
+      "grad_norm": 0.125727117061615,
+      "learning_rate": 9.438924835669532e-05,
+      "loss": 0.0272,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0718956870611835,
+      "grad_norm": 0.13635793328285217,
+      "learning_rate": 9.33516338286114e-05,
+      "loss": 0.0339,
+      "step": 1670
+    },
+    {
+      "epoch": 1.0783149448345035,
+      "grad_norm": 0.1484992802143097,
+      "learning_rate": 9.231473779237579e-05,
+      "loss": 0.0301,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0847342026078235,
+      "grad_norm": 0.12370124459266663,
+      "learning_rate": 9.127867230579788e-05,
+      "loss": 0.0244,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0911534603811435,
+      "grad_norm": 0.11567872017621994,
+      "learning_rate": 9.024354933692935e-05,
+      "loss": 0.0291,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0975727181544634,
+      "grad_norm": 0.09546232968568802,
+      "learning_rate": 8.920948075196332e-05,
+      "loss": 0.0229,
+      "step": 1710
+    },
+    {
+      "epoch": 1.1039919759277834,
+      "grad_norm": 0.05330915004014969,
+      "learning_rate": 8.817657830314546e-05,
+      "loss": 0.029,
+      "step": 1720
+    },
+    {
+      "epoch": 1.1104112337011034,
+      "grad_norm": 0.07367228716611862,
+      "learning_rate": 8.714495361669644e-05,
+      "loss": 0.0316,
+      "step": 1730
+    },
+    {
+      "epoch": 1.1168304914744234,
+      "grad_norm": 0.056328821927309036,
+      "learning_rate": 8.61147181807486e-05,
+      "loss": 0.0256,
+      "step": 1740
+    },
+    {
+      "epoch": 1.1232497492477433,
+      "grad_norm": 0.11536078155040741,
+      "learning_rate": 8.508598333329744e-05,
+      "loss": 0.0234,
+      "step": 1750
+    },
+    {
+      "epoch": 1.129669007021063,
+      "grad_norm": 0.09325211495161057,
+      "learning_rate": 8.405886025016911e-05,
+      "loss": 0.0316,
+      "step": 1760
+    },
+    {
+      "epoch": 1.136088264794383,
+      "grad_norm": 0.06355856359004974,
+      "learning_rate": 8.303345993300575e-05,
+      "loss": 0.0255,
+      "step": 1770
+    },
+    {
+      "epoch": 1.142507522567703,
+      "grad_norm": 0.11567080020904541,
+      "learning_rate": 8.200989319726937e-05,
+      "loss": 0.0253,
+      "step": 1780
+    },
+    {
+      "epoch": 1.148926780341023,
+      "grad_norm": 0.11177469789981842,
+      "learning_rate": 8.098827066026615e-05,
+      "loss": 0.025,
+      "step": 1790
+    },
+    {
+      "epoch": 1.155346038114343,
+      "grad_norm": 0.12497933954000473,
+      "learning_rate": 7.996870272919165e-05,
+      "loss": 0.0274,
+      "step": 1800
+    },
+    {
+      "epoch": 1.161765295887663,
+      "grad_norm": 0.11924732476472855,
+      "learning_rate": 7.895129958919947e-05,
+      "loss": 0.0321,
+      "step": 1810
+    },
+    {
+      "epoch": 1.168184553660983,
+      "grad_norm": 0.08141748607158661,
+      "learning_rate": 7.793617119149319e-05,
+      "loss": 0.0262,
+      "step": 1820
+    },
+    {
+      "epoch": 1.174603811434303,
+      "grad_norm": 0.11012791097164154,
+      "learning_rate": 7.692342724144397e-05,
+      "loss": 0.0226,
+      "step": 1830
+    },
+    {
+      "epoch": 1.181023069207623,
+      "grad_norm": 0.12142367660999298,
+      "learning_rate": 7.59131771867348e-05,
+      "loss": 0.0302,
+      "step": 1840
+    },
+    {
+      "epoch": 1.1874423269809429,
+      "grad_norm": 0.04989106208086014,
+      "learning_rate": 7.490553020553214e-05,
+      "loss": 0.0297,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1938615847542629,
+      "grad_norm": 0.11036371439695358,
+      "learning_rate": 7.390059519468726e-05,
+      "loss": 0.0287,
+      "step": 1860
+    },
+    {
+      "epoch": 1.2002808425275828,
+      "grad_norm": 0.09587734192609787,
+      "learning_rate": 7.289848075796755e-05,
+      "loss": 0.022,
+      "step": 1870
+    },
+    {
+      "epoch": 1.2067001003009028,
+      "grad_norm": 0.1048700138926506,
+      "learning_rate": 7.189929519431982e-05,
+      "loss": 0.0246,
+      "step": 1880
+    },
+    {
+      "epoch": 1.2131193580742226,
+      "grad_norm": 0.1045086681842804,
+      "learning_rate": 7.090314648616607e-05,
+      "loss": 0.0245,
+      "step": 1890
+    },
+    {
+      "epoch": 1.2195386158475428,
+      "grad_norm": 0.08027637004852295,
+      "learning_rate": 6.991014228773421e-05,
+      "loss": 0.027,
+      "step": 1900
+    },
+    {
+      "epoch": 1.2259578736208625,
+      "grad_norm": 0.09890926629304886,
+      "learning_rate": 6.892038991342349e-05,
+      "loss": 0.0266,
+      "step": 1910
+    },
+    {
+      "epoch": 1.2323771313941825,
+      "grad_norm": 0.06575173139572144,
+      "learning_rate": 6.793399632620715e-05,
+      "loss": 0.0232,
+      "step": 1920
+    },
+    {
+      "epoch": 1.2387963891675025,
+      "grad_norm": 0.07365952432155609,
+      "learning_rate": 6.695106812607282e-05,
+      "loss": 0.0222,
+      "step": 1930
+    },
+    {
+      "epoch": 1.2452156469408224,
+      "grad_norm": 0.1399465799331665,
+      "learning_rate": 6.597171153850219e-05,
+      "loss": 0.0239,
+      "step": 1940
+    },
+    {
+      "epoch": 1.2516349047141424,
+      "grad_norm": 0.07138116657733917,
+      "learning_rate": 6.499603240299133e-05,
+      "loss": 0.0261,
+      "step": 1950
+    },
+    {
+      "epoch": 1.2580541624874624,
+      "grad_norm": 0.07462996989488602,
+      "learning_rate": 6.40241361616123e-05,
+      "loss": 0.0263,
+      "step": 1960
+    },
+    {
+      "epoch": 1.2644734202607824,
+      "grad_norm": 0.06026960164308548,
+      "learning_rate": 6.305612784761823e-05,
+      "loss": 0.0286,
+      "step": 1970
+    },
+    {
+      "epoch": 1.2708926780341023,
+      "grad_norm": 0.04298267886042595,
+      "learning_rate": 6.209211207409225e-05,
+      "loss": 0.0255,
+      "step": 1980
+    },
+    {
+      "epoch": 1.2773119358074223,
+      "grad_norm": 0.053071849048137665,
+      "learning_rate": 6.113219302264174e-05,
+      "loss": 0.0217,
+      "step": 1990
+    },
+    {
+      "epoch": 1.2837311935807423,
+      "grad_norm": 0.05462060496211052,
+      "learning_rate": 6.017647443213974e-05,
+      "loss": 0.0246,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2837311935807423,
+      "eval_loss": 0.046459589153528214,
+      "eval_runtime": 128.8944,
+      "eval_samples_per_second": 10.202,
+      "eval_steps_per_second": 10.202,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2901504513540623,
+      "grad_norm": 0.052726052701473236,
+      "learning_rate": 5.9225059587513454e-05,
+      "loss": 0.0227,
+      "step": 2010
+    },
+    {
+      "epoch": 1.296569709127382,
+      "grad_norm": 0.05038246139883995,
+      "learning_rate": 5.8278051308582505e-05,
+      "loss": 0.0273,
+      "step": 2020
+    },
+    {
+      "epoch": 1.3029889669007022,
+      "grad_norm": 0.09061837941408157,
+      "learning_rate": 5.733555193894695e-05,
+      "loss": 0.0296,
+      "step": 2030
+    },
+    {
+      "epoch": 1.309408224674022,
+      "grad_norm": 0.11903833597898483,
+      "learning_rate": 5.6397663334927096e-05,
+      "loss": 0.0309,
+      "step": 2040
+    },
+    {
+      "epoch": 1.3158274824473422,
+      "grad_norm": 0.05378040671348572,
+      "learning_rate": 5.5464486854555744e-05,
+      "loss": 0.0236,
+      "step": 2050
+    },
+    {
+      "epoch": 1.322246740220662,
+      "grad_norm": 0.049429334700107574,
+      "learning_rate": 5.453612334662446e-05,
+      "loss": 0.0248,
+      "step": 2060
+    },
+    {
+      "epoch": 1.3286659979939819,
+      "grad_norm": 0.0931214764714241,
+      "learning_rate": 5.361267313978472e-05,
+      "loss": 0.0319,
+      "step": 2070
+    },
+    {
+      "epoch": 1.3350852557673019,
+      "grad_norm": 0.0589577853679657,
+      "learning_rate": 5.2694236031705446e-05,
+      "loss": 0.0253,
+      "step": 2080
+    },
+    {
+      "epoch": 1.3415045135406218,
+      "grad_norm": 0.10711564123630524,
+      "learning_rate": 5.178091127828777e-05,
+      "loss": 0.0283,
+      "step": 2090
+    },
+    {
+      "epoch": 1.3479237713139418,
+      "grad_norm": 0.0872858390212059,
+      "learning_rate": 5.087279758293837e-05,
+      "loss": 0.0237,
+      "step": 2100
+    },
+    {
+      "epoch": 1.3543430290872618,
+      "grad_norm": 0.05081092566251755,
+      "learning_rate": 4.996999308590266e-05,
+      "loss": 0.0246,
+      "step": 2110
+    },
+    {
+      "epoch": 1.3607622868605818,
+      "grad_norm": 0.06178516149520874,
+      "learning_rate": 4.907259535365859e-05,
+      "loss": 0.0238,
+      "step": 2120
+    },
+    {
+      "epoch": 1.3671815446339017,
+      "grad_norm": 0.054508257657289505,
+      "learning_rate": 4.818070136837275e-05,
+      "loss": 0.0209,
+      "step": 2130
+    },
+    {
+      "epoch": 1.3736008024072217,
+      "grad_norm": 0.1331976354122162,
+      "learning_rate": 4.72944075174193e-05,
+      "loss": 0.022,
+      "step": 2140
+    },
+    {
+      "epoch": 1.3800200601805417,
+      "grad_norm": 0.06243107095360756,
+      "learning_rate": 4.6413809582963484e-05,
+      "loss": 0.0172,
+      "step": 2150
+    },
+    {
+      "epoch": 1.3864393179538617,
+      "grad_norm": 0.08247397094964981,
+      "learning_rate": 4.553900273161036e-05,
+      "loss": 0.0213,
+      "step": 2160
+    },
+    {
+      "epoch": 1.3928585757271814,
+      "grad_norm": 0.06733040511608124,
+      "learning_rate": 4.467008150412e-05,
+      "loss": 0.0247,
+      "step": 2170
+    },
+    {
+      "epoch": 1.3992778335005016,
+      "grad_norm": 0.05423252657055855,
+      "learning_rate": 4.3807139805190613e-05,
+      "loss": 0.0247,
+      "step": 2180
+    },
+    {
+      "epoch": 1.4056970912738214,
+      "grad_norm": 0.10230846703052521,
+      "learning_rate": 4.295027089331013e-05,
+      "loss": 0.0212,
+      "step": 2190
+    },
+    {
+      "epoch": 1.4121163490471416,
+      "grad_norm": 0.05248994752764702,
+      "learning_rate": 4.2099567370677687e-05,
+      "loss": 0.0177,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4185356068204613,
+      "grad_norm": 0.07035645842552185,
+      "learning_rate": 4.125512117319612e-05,
+      "loss": 0.021,
+      "step": 2210
+    },
+    {
+      "epoch": 1.4249548645937813,
+      "grad_norm": 0.10099633783102036,
+      "learning_rate": 4.041702356053639e-05,
+      "loss": 0.0212,
+      "step": 2220
+    },
+    {
+      "epoch": 1.4313741223671013,
+      "grad_norm": 0.13029153645038605,
+      "learning_rate": 3.958536510627511e-05,
+      "loss": 0.0191,
+      "step": 2230
+    },
+    {
+      "epoch": 1.4377933801404212,
+      "grad_norm": 0.06044310703873634,
+      "learning_rate": 3.876023568810622e-05,
+      "loss": 0.0205,
+      "step": 2240
+    },
+    {
+      "epoch": 1.4442126379137412,
+      "grad_norm": 0.07602677494287491,
+      "learning_rate": 3.794172447812785e-05,
+      "loss": 0.0184,
+      "step": 2250
+    },
+    {
+      "epoch": 1.4506318956870612,
+      "grad_norm": 0.08735393732786179,
+      "learning_rate": 3.7129919933205536e-05,
+      "loss": 0.0251,
+      "step": 2260
+    },
+    {
+      "epoch": 1.4570511534603812,
+      "grad_norm": 0.04306063801050186,
+      "learning_rate": 3.6324909785412445e-05,
+      "loss": 0.0183,
+      "step": 2270
+    },
+    {
+      "epoch": 1.4634704112337011,
+      "grad_norm": 0.05602416768670082,
+      "learning_rate": 3.552678103254838e-05,
+      "loss": 0.0229,
+      "step": 2280
+    },
+    {
+      "epoch": 1.4698896690070211,
+      "grad_norm": 0.09133511781692505,
+      "learning_rate": 3.4735619928737764e-05,
+      "loss": 0.0211,
+      "step": 2290
+    },
+    {
+      "epoch": 1.476308926780341,
+      "grad_norm": 0.10781540721654892,
+      "learning_rate": 3.395151197510804e-05,
+      "loss": 0.0198,
+      "step": 2300
+    },
+    {
+      "epoch": 1.482728184553661,
+      "grad_norm": 0.06286901980638504,
+      "learning_rate": 3.3174541910549784e-05,
+      "loss": 0.0221,
+      "step": 2310
+    },
+    {
+      "epoch": 1.4891474423269808,
+      "grad_norm": 0.046533193439245224,
+      "learning_rate": 3.2404793702558636e-05,
+      "loss": 0.0188,
+      "step": 2320
+    },
+    {
+      "epoch": 1.495566700100301,
+      "grad_norm": 0.034077636897563934,
+      "learning_rate": 3.1642350538161045e-05,
+      "loss": 0.017,
+      "step": 2330
+    },
+    {
+      "epoch": 1.5019859578736208,
+      "grad_norm": 0.053730156272649765,
+      "learning_rate": 3.088729481492424e-05,
+      "loss": 0.0204,
+      "step": 2340
+    },
+    {
+      "epoch": 1.508405215646941,
+      "grad_norm": 0.060676686465740204,
+      "learning_rate": 3.0139708132051424e-05,
+      "loss": 0.0201,
+      "step": 2350
+    },
+    {
+      "epoch": 1.5148244734202607,
+      "grad_norm": 0.09910279512405396,
+      "learning_rate": 2.939967128156328e-05,
+      "loss": 0.0173,
+      "step": 2360
+    },
+    {
+      "epoch": 1.5212437311935807,
+      "grad_norm": 0.0654776319861412,
+      "learning_rate": 2.866726423956687e-05,
+      "loss": 0.0233,
+      "step": 2370
+    },
+    {
+      "epoch": 1.5276629889669007,
+      "grad_norm": 0.08738186955451965,
+      "learning_rate": 2.794256615761247e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 1.5340822467402206,
+      "grad_norm": 0.12039466947317123,
+      "learning_rate": 2.7225655354139677e-05,
+      "loss": 0.0193,
+      "step": 2390
+    },
+    {
+      "epoch": 1.5405015045135406,
+      "grad_norm": 0.12398708611726761,
+      "learning_rate": 2.6516609306013462e-05,
+      "loss": 0.024,
+      "step": 2400
+    },
+    {
+      "epoch": 1.5469207622868606,
+      "grad_norm": 0.09554693102836609,
+      "learning_rate": 2.5815504640151267e-05,
+      "loss": 0.0195,
+      "step": 2410
+    },
+    {
+      "epoch": 1.5533400200601806,
+      "grad_norm": 0.26604852080345154,
+      "learning_rate": 2.512241712524185e-05,
+      "loss": 0.0277,
+      "step": 2420
+    },
+    {
+      "epoch": 1.5597592778335005,
+      "grad_norm": 0.08400601893663406,
+      "learning_rate": 2.443742166355695e-05,
+      "loss": 0.0267,
+      "step": 2430
+    },
+    {
+      "epoch": 1.5661785356068205,
+      "grad_norm": 0.062076788395643234,
+      "learning_rate": 2.3760592282856565e-05,
+      "loss": 0.0209,
+      "step": 2440
+    },
+    {
+      "epoch": 1.5725977933801403,
+      "grad_norm": 0.10021471977233887,
+      "learning_rate": 2.309200212838878e-05,
+      "loss": 0.0225,
+      "step": 2450
+    },
+    {
+      "epoch": 1.5790170511534605,
+      "grad_norm": 0.06898768991231918,
+      "learning_rate": 2.2431723454984778e-05,
+      "loss": 0.0175,
+      "step": 2460
+    },
+    {
+      "epoch": 1.5854363089267802,
+      "grad_norm": 0.06565247476100922,
+      "learning_rate": 2.1779827619250458e-05,
+      "loss": 0.019,
+      "step": 2470
+    },
+    {
+      "epoch": 1.5918555667001004,
+      "grad_norm": 0.10120698809623718,
+      "learning_rate": 2.1136385071854715e-05,
+      "loss": 0.0214,
+      "step": 2480
+    },
+    {
+      "epoch": 1.5982748244734202,
+      "grad_norm": 0.03913561999797821,
+      "learning_rate": 2.050146534991587e-05,
+      "loss": 0.0214,
+      "step": 2490
+    },
+    {
+      "epoch": 1.6046940822467404,
+      "grad_norm": 0.07003747671842575,
+      "learning_rate": 1.987513706948678e-05,
+      "loss": 0.0175,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6046940822467404,
+      "eval_loss": 0.04088287055492401,
+      "eval_runtime": 129.5987,
+      "eval_samples_per_second": 10.147,
+      "eval_steps_per_second": 10.147,
+      "step": 2500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3116,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3727062552054118e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cac3f796d8e8ed78082f710e5c9ee0db63889906ddeefcc55981584c04b6c12
+size 5624

checkpoint-2500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Coder-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Coder-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoint-3000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-3000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb453fefadc9b96c6f5f72e9b3c53ec46748149127236a2cdf80a24e561d676
+size 645975704

checkpoint-3000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-3000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-3000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e82443b51a44facea6f5954743aa62de551f6bb1b65125f3bede6aee7f55b42
+size 1292176234

checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff8cbb1dfbc921ccfabe33a95c06fb3b8aced50bbf529ea70e0af3b78f9ee445
+size 14244

checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f34325fb315c9b55a9077fafc2fe88f6759c4cbacfb64403bb93806d4f12f01a
+size 1064

checkpoint-3000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-3000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83396048d512ec1f3178af0d7c1f79a226bba041822614b0e26a4fd2d4b55bf7
+size 11421995

checkpoint-3000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2182 @@

+{
+  "best_global_step": 3000,
+  "best_metric": 0.03883149474859238,
+  "best_model_checkpoint": "./checkpoints/checkpoint-3000",
+  "epoch": 1.9256569709127382,
+  "eval_steps": 500,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064192577733199595,
+      "grad_norm": 0.734896719455719,
+      "learning_rate": 1.9148936170212766e-05,
+      "loss": 1.2808,
+      "step": 10
+    },
+    {
+      "epoch": 0.012838515546639919,
+      "grad_norm": 0.5790244340896606,
+      "learning_rate": 4.0425531914893614e-05,
+      "loss": 1.065,
+      "step": 20
+    },
+    {
+      "epoch": 0.01925777331995988,
+      "grad_norm": 0.20778240263462067,
+      "learning_rate": 6.170212765957447e-05,
+      "loss": 0.6486,
+      "step": 30
+    },
+    {
+      "epoch": 0.025677031093279838,
+      "grad_norm": 0.17519450187683105,
+      "learning_rate": 8.297872340425533e-05,
+      "loss": 0.559,
+      "step": 40
+    },
+    {
+      "epoch": 0.0320962888665998,
+      "grad_norm": 0.2367718517780304,
+      "learning_rate": 0.00010425531914893618,
+      "loss": 0.4906,
+      "step": 50
+    },
+    {
+      "epoch": 0.03851554663991976,
+      "grad_norm": 0.24615059792995453,
+      "learning_rate": 0.00012553191489361702,
+      "loss": 0.4413,
+      "step": 60
+    },
+    {
+      "epoch": 0.04493480441323972,
+      "grad_norm": 0.22158223390579224,
+      "learning_rate": 0.00014680851063829788,
+      "loss": 0.386,
+      "step": 70
+    },
+    {
+      "epoch": 0.051354062186559676,
+      "grad_norm": 0.18299002945423126,
+      "learning_rate": 0.00016808510638297873,
+      "loss": 0.37,
+      "step": 80
+    },
+    {
+      "epoch": 0.05777331995987964,
+      "grad_norm": 0.483330100774765,
+      "learning_rate": 0.00018936170212765957,
+      "loss": 0.408,
+      "step": 90
+    },
+    {
+      "epoch": 0.0641925777331996,
+      "grad_norm": 0.36728036403656006,
+      "learning_rate": 0.00019999864911039267,
+      "loss": 0.3986,
+      "step": 100
+    },
+    {
+      "epoch": 0.07061183550651956,
+      "grad_norm": 0.23549625277519226,
+      "learning_rate": 0.00019998784221252132,
+      "loss": 0.3367,
+      "step": 110
+    },
+    {
+      "epoch": 0.07703109327983952,
+      "grad_norm": 0.2653771936893463,
+      "learning_rate": 0.0001999662295846848,
+      "loss": 0.3457,
+      "step": 120
+    },
+    {
+      "epoch": 0.08345035105315948,
+      "grad_norm": 0.21738414466381073,
+      "learning_rate": 0.0001999338135625693,
+      "loss": 0.3075,
+      "step": 130
+    },
+    {
+      "epoch": 0.08986960882647944,
+      "grad_norm": 0.2068450152873993,
+      "learning_rate": 0.00019989059764938857,
+      "loss": 0.3423,
+      "step": 140
+    },
+    {
+      "epoch": 0.09628886659979939,
+      "grad_norm": 0.24901063740253448,
+      "learning_rate": 0.00019983658651550522,
+      "loss": 0.2767,
+      "step": 150
+    },
+    {
+      "epoch": 0.10270812437311935,
+      "grad_norm": 0.34664201736450195,
+      "learning_rate": 0.00019977178599792623,
+      "loss": 0.297,
+      "step": 160
+    },
+    {
+      "epoch": 0.10912738214643931,
+      "grad_norm": 0.29314759373664856,
+      "learning_rate": 0.00019969620309967198,
+      "loss": 0.2975,
+      "step": 170
+    },
+    {
+      "epoch": 0.11554663991975928,
+      "grad_norm": 0.3220290541648865,
+      "learning_rate": 0.0001996098459890194,
+      "loss": 0.2717,
+      "step": 180
+    },
+    {
+      "epoch": 0.12196589769307924,
+      "grad_norm": 0.26825714111328125,
+      "learning_rate": 0.00019951272399861938,
+      "loss": 0.262,
+      "step": 190
+    },
+    {
+      "epoch": 0.1283851554663992,
+      "grad_norm": 0.2678840458393097,
+      "learning_rate": 0.00019940484762448794,
+      "loss": 0.2439,
+      "step": 200
+    },
+    {
+      "epoch": 0.13480441323971915,
+      "grad_norm": 0.26287031173706055,
+      "learning_rate": 0.00019928622852487216,
+      "loss": 0.2463,
+      "step": 210
+    },
+    {
+      "epoch": 0.14122367101303912,
+      "grad_norm": 0.2625996172428131,
+      "learning_rate": 0.00019915687951899025,
+      "loss": 0.2158,
+      "step": 220
+    },
+    {
+      "epoch": 0.14764292878635907,
+      "grad_norm": 0.21532030403614044,
+      "learning_rate": 0.00019901681458564592,
+      "loss": 0.2505,
+      "step": 230
+    },
+    {
+      "epoch": 0.15406218655967904,
+      "grad_norm": 0.18523766100406647,
+      "learning_rate": 0.00019886604886171797,
+      "loss": 0.2648,
+      "step": 240
+    },
+    {
+      "epoch": 0.160481444332999,
+      "grad_norm": 0.1750323474407196,
+      "learning_rate": 0.00019870459864052435,
+      "loss": 0.1999,
+      "step": 250
+    },
+    {
+      "epoch": 0.16690070210631897,
+      "grad_norm": 0.21571072936058044,
+      "learning_rate": 0.00019853248137006123,
+      "loss": 0.2513,
+      "step": 260
+    },
+    {
+      "epoch": 0.1733199598796389,
+      "grad_norm": 0.4137316942214966,
+      "learning_rate": 0.00019834971565111758,
+      "loss": 0.2365,
+      "step": 270
+    },
+    {
+      "epoch": 0.1797392176529589,
+      "grad_norm": 0.1867353618144989,
+      "learning_rate": 0.0001981563212352648,
+      "loss": 0.2136,
+      "step": 280
+    },
+    {
+      "epoch": 0.18615847542627884,
+      "grad_norm": 0.24344463646411896,
+      "learning_rate": 0.0001979523190227222,
+      "loss": 0.2051,
+      "step": 290
+    },
+    {
+      "epoch": 0.19257773319959878,
+      "grad_norm": 0.32625189423561096,
+      "learning_rate": 0.0001977377310600984,
+      "loss": 0.2142,
+      "step": 300
+    },
+    {
+      "epoch": 0.19899699097291876,
+      "grad_norm": 0.2645917236804962,
+      "learning_rate": 0.00019751258053800865,
+      "loss": 0.2169,
+      "step": 310
+    },
+    {
+      "epoch": 0.2054162487462387,
+      "grad_norm": 0.2912643849849701,
+      "learning_rate": 0.0001972768917885686,
+      "loss": 0.2276,
+      "step": 320
+    },
+    {
+      "epoch": 0.21183550651955868,
+      "grad_norm": 0.18374884128570557,
+      "learning_rate": 0.00019703069028276482,
+      "loss": 0.181,
+      "step": 330
+    },
+    {
+      "epoch": 0.21825476429287863,
+      "grad_norm": 0.22864000499248505,
+      "learning_rate": 0.000196774002627702,
+      "loss": 0.2036,
+      "step": 340
+    },
+    {
+      "epoch": 0.2246740220661986,
+      "grad_norm": 0.1839989423751831,
+      "learning_rate": 0.00019650685656372763,
+      "loss": 0.1646,
+      "step": 350
+    },
+    {
+      "epoch": 0.23109327983951855,
+      "grad_norm": 0.2861062288284302,
+      "learning_rate": 0.000196229280961434,
+      "loss": 0.1854,
+      "step": 360
+    },
+    {
+      "epoch": 0.23751253761283853,
+      "grad_norm": 0.19540704786777496,
+      "learning_rate": 0.00019594130581853823,
+      "loss": 0.1444,
+      "step": 370
+    },
+    {
+      "epoch": 0.24393179538615847,
+      "grad_norm": 0.33926671743392944,
+      "learning_rate": 0.0001956429622566403,
+      "loss": 0.1933,
+      "step": 380
+    },
+    {
+      "epoch": 0.2503510531594784,
+      "grad_norm": 0.3747280538082123,
+      "learning_rate": 0.00019533428251785983,
+      "loss": 0.1932,
+      "step": 390
+    },
+    {
+      "epoch": 0.2567703109327984,
+      "grad_norm": 0.1787070333957672,
+      "learning_rate": 0.00019501529996135156,
+      "loss": 0.1855,
+      "step": 400
+    },
+    {
+      "epoch": 0.26318956870611837,
+      "grad_norm": 0.21175768971443176,
+      "learning_rate": 0.00019468604905970033,
+      "loss": 0.2053,
+      "step": 410
+    },
+    {
+      "epoch": 0.2696088264794383,
+      "grad_norm": 0.22584153711795807,
+      "learning_rate": 0.00019434656539519548,
+      "loss": 0.1551,
+      "step": 420
+    },
+    {
+      "epoch": 0.27602808425275827,
+      "grad_norm": 0.20716825127601624,
+      "learning_rate": 0.00019399688565598547,
+      "loss": 0.1708,
+      "step": 430
+    },
+    {
+      "epoch": 0.28244734202607824,
+      "grad_norm": 0.21005718410015106,
+      "learning_rate": 0.0001936370476321132,
+      "loss": 0.1279,
+      "step": 440
+    },
+    {
+      "epoch": 0.2888665997993982,
+      "grad_norm": 0.209135502576828,
+      "learning_rate": 0.00019326709021143167,
+      "loss": 0.1584,
+      "step": 450
+    },
+    {
+      "epoch": 0.29528585757271814,
+      "grad_norm": 0.2232855260372162,
+      "learning_rate": 0.00019288705337540166,
+      "loss": 0.1562,
+      "step": 460
+    },
+    {
+      "epoch": 0.3017051153460381,
+      "grad_norm": 0.2212464064359665,
+      "learning_rate": 0.0001924969781947707,
+      "loss": 0.1733,
+      "step": 470
+    },
+    {
+      "epoch": 0.3081243731193581,
+      "grad_norm": 0.18040229380130768,
+      "learning_rate": 0.00019209690682513465,
+      "loss": 0.12,
+      "step": 480
+    },
+    {
+      "epoch": 0.31454363089267806,
+      "grad_norm": 0.16415567696094513,
+      "learning_rate": 0.0001916868825023819,
+      "loss": 0.1257,
+      "step": 490
+    },
+    {
+      "epoch": 0.320962888665998,
+      "grad_norm": 0.18324217200279236,
+      "learning_rate": 0.00019126694953802093,
+      "loss": 0.138,
+      "step": 500
+    },
+    {
+      "epoch": 0.320962888665998,
+      "eval_loss": 0.15367139875888824,
+      "eval_runtime": 127.7222,
+      "eval_samples_per_second": 10.296,
+      "eval_steps_per_second": 10.296,
+      "step": 500
+    },
+    {
+      "epoch": 0.32738214643931796,
+      "grad_norm": 0.15339615941047668,
+      "learning_rate": 0.00019083715331439134,
+      "loss": 0.1653,
+      "step": 510
+    },
+    {
+      "epoch": 0.33380140421263793,
+      "grad_norm": 0.2750616669654846,
+      "learning_rate": 0.00019039754027975952,
+      "loss": 0.1834,
+      "step": 520
+    },
+    {
+      "epoch": 0.34022066198595785,
+      "grad_norm": 0.20012134313583374,
+      "learning_rate": 0.00018994815794329896,
+      "loss": 0.1278,
+      "step": 530
+    },
+    {
+      "epoch": 0.3466399197592778,
+      "grad_norm": 0.24493156373500824,
+      "learning_rate": 0.0001894890548699559,
+      "loss": 0.142,
+      "step": 540
+    },
+    {
+      "epoch": 0.3530591775325978,
+      "grad_norm": 0.1628945767879486,
+      "learning_rate": 0.0001890202806752008,
+      "loss": 0.1287,
+      "step": 550
+    },
+    {
+      "epoch": 0.3594784353059178,
+      "grad_norm": 0.24853569269180298,
+      "learning_rate": 0.00018854188601966657,
+      "loss": 0.1397,
+      "step": 560
+    },
+    {
+      "epoch": 0.3658976930792377,
+      "grad_norm": 0.1540278196334839,
+      "learning_rate": 0.0001880539226036734,
+      "loss": 0.1343,
+      "step": 570
+    },
+    {
+      "epoch": 0.37231695085255767,
+      "grad_norm": 0.3391481935977936,
+      "learning_rate": 0.00018755644316164176,
+      "loss": 0.122,
+      "step": 580
+    },
+    {
+      "epoch": 0.37873620862587765,
+      "grad_norm": 0.19285668432712555,
+      "learning_rate": 0.0001870495014563931,
+      "loss": 0.1015,
+      "step": 590
+    },
+    {
+      "epoch": 0.38515546639919757,
+      "grad_norm": 0.19064725935459137,
+      "learning_rate": 0.00018653315227333992,
+      "loss": 0.1165,
+      "step": 600
+    },
+    {
+      "epoch": 0.39157472417251754,
+      "grad_norm": 0.21822167932987213,
+      "learning_rate": 0.00018600745141456485,
+      "loss": 0.1185,
+      "step": 610
+    },
+    {
+      "epoch": 0.3979939819458375,
+      "grad_norm": 0.21495632827281952,
+      "learning_rate": 0.0001854724556927903,
+      "loss": 0.1455,
+      "step": 620
+    },
+    {
+      "epoch": 0.4044132397191575,
+      "grad_norm": 0.21818260848522186,
+      "learning_rate": 0.00018492822292523863,
+      "loss": 0.1045,
+      "step": 630
+    },
+    {
+      "epoch": 0.4108324974924774,
+      "grad_norm": 0.2295123040676117,
+      "learning_rate": 0.0001843748119273837,
+      "loss": 0.1331,
+      "step": 640
+    },
+    {
+      "epoch": 0.4172517552657974,
+      "grad_norm": 0.3051343560218811,
+      "learning_rate": 0.0001838122825065948,
+      "loss": 0.1139,
+      "step": 650
+    },
+    {
+      "epoch": 0.42367101303911736,
+      "grad_norm": 0.24361808598041534,
+      "learning_rate": 0.0001832406954556732,
+      "loss": 0.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.43009027081243734,
+      "grad_norm": 0.19327011704444885,
+      "learning_rate": 0.00018266011254628218,
+      "loss": 0.087,
+      "step": 670
+    },
+    {
+      "epoch": 0.43650952858575726,
+      "grad_norm": 0.21391886472702026,
+      "learning_rate": 0.0001820705965222714,
+      "loss": 0.1079,
+      "step": 680
+    },
+    {
+      "epoch": 0.44292878635907723,
+      "grad_norm": 0.21966692805290222,
+      "learning_rate": 0.0001814722110928962,
+      "loss": 0.1188,
+      "step": 690
+    },
+    {
+      "epoch": 0.4493480441323972,
+      "grad_norm": 0.19202131032943726,
+      "learning_rate": 0.00018086502092593234,
+      "loss": 0.0991,
+      "step": 700
+    },
+    {
+      "epoch": 0.4557673019057171,
+      "grad_norm": 0.1604185700416565,
+      "learning_rate": 0.00018024909164068755,
+      "loss": 0.094,
+      "step": 710
+    },
+    {
+      "epoch": 0.4621865596790371,
+      "grad_norm": 0.21912045776844025,
+      "learning_rate": 0.00017962448980090982,
+      "loss": 0.1093,
+      "step": 720
+    },
+    {
+      "epoch": 0.4686058174523571,
+      "grad_norm": 0.16105616092681885,
+      "learning_rate": 0.00017899128290759395,
+      "loss": 0.0747,
+      "step": 730
+    },
+    {
+      "epoch": 0.47502507522567705,
+      "grad_norm": 0.19927042722702026,
+      "learning_rate": 0.00017834953939168663,
+      "loss": 0.1057,
+      "step": 740
+    },
+    {
+      "epoch": 0.48144433299899697,
+      "grad_norm": 0.2838548421859741,
+      "learning_rate": 0.00017769932860669111,
+      "loss": 0.0891,
+      "step": 750
+    },
+    {
+      "epoch": 0.48786359077231695,
+      "grad_norm": 0.16990593075752258,
+      "learning_rate": 0.00017704072082117215,
+      "loss": 0.0993,
+      "step": 760
+    },
+    {
+      "epoch": 0.4942828485456369,
+      "grad_norm": 0.22423957288265228,
+      "learning_rate": 0.00017637378721116197,
+      "loss": 0.0787,
+      "step": 770
+    },
+    {
+      "epoch": 0.5007021063189568,
+      "grad_norm": 0.224956676363945,
+      "learning_rate": 0.00017569859985246838,
+      "loss": 0.1019,
+      "step": 780
+    },
+    {
+      "epoch": 0.5071213640922768,
+      "grad_norm": 0.11727194488048553,
+      "learning_rate": 0.0001750152317128854,
+      "loss": 0.0806,
+      "step": 790
+    },
+    {
+      "epoch": 0.5135406218655968,
+      "grad_norm": 0.13485625386238098,
+      "learning_rate": 0.00017432375664430753,
+      "loss": 0.0873,
+      "step": 800
+    },
+    {
+      "epoch": 0.5199598796389168,
+      "grad_norm": 0.2887745201587677,
+      "learning_rate": 0.0001736242493747488,
+      "loss": 0.0796,
+      "step": 810
+    },
+    {
+      "epoch": 0.5263791374122367,
+      "grad_norm": 0.3110921084880829,
+      "learning_rate": 0.00017291678550026665,
+      "loss": 0.0912,
+      "step": 820
+    },
+    {
+      "epoch": 0.5327983951855567,
+      "grad_norm": 0.17630113661289215,
+      "learning_rate": 0.0001722014414767923,
+      "loss": 0.0795,
+      "step": 830
+    },
+    {
+      "epoch": 0.5392176529588766,
+      "grad_norm": 0.1666630357503891,
+      "learning_rate": 0.00017147829461186816,
+      "loss": 0.0791,
+      "step": 840
+    },
+    {
+      "epoch": 0.5456369107321966,
+      "grad_norm": 0.17798000574111938,
+      "learning_rate": 0.00017074742305629305,
+      "loss": 0.1046,
+      "step": 850
+    },
+    {
+      "epoch": 0.5520561685055165,
+      "grad_norm": 0.30929967761039734,
+      "learning_rate": 0.0001700089057956766,
+      "loss": 0.088,
+      "step": 860
+    },
+    {
+      "epoch": 0.5584754262788365,
+      "grad_norm": 0.12197452783584595,
+      "learning_rate": 0.00016926282264190313,
+      "loss": 0.0766,
+      "step": 870
+    },
+    {
+      "epoch": 0.5648946840521565,
+      "grad_norm": 0.254573255777359,
+      "learning_rate": 0.00016850925422450626,
+      "loss": 0.1153,
+      "step": 880
+    },
+    {
+      "epoch": 0.5713139418254765,
+      "grad_norm": 0.14357267320156097,
+      "learning_rate": 0.0001677482819819554,
+      "loss": 0.0919,
+      "step": 890
+    },
+    {
+      "epoch": 0.5777331995987964,
+      "grad_norm": 0.22417870163917542,
+      "learning_rate": 0.00016697998815285461,
+      "loss": 0.0771,
+      "step": 900
+    },
+    {
+      "epoch": 0.5841524573721163,
+      "grad_norm": 0.37406066060066223,
+      "learning_rate": 0.00016620445576705502,
+      "loss": 0.0783,
+      "step": 910
+    },
+    {
+      "epoch": 0.5905717151454363,
+      "grad_norm": 0.1301262080669403,
+      "learning_rate": 0.0001654217686366817,
+      "loss": 0.064,
+      "step": 920
+    },
+    {
+      "epoch": 0.5969909729187562,
+      "grad_norm": 0.13001087307929993,
+      "learning_rate": 0.0001646320113470761,
+      "loss": 0.0554,
+      "step": 930
+    },
+    {
+      "epoch": 0.6034102306920762,
+      "grad_norm": 0.1982804834842682,
+      "learning_rate": 0.00016383526924765494,
+      "loss": 0.0902,
+      "step": 940
+    },
+    {
+      "epoch": 0.6098294884653962,
+      "grad_norm": 0.14553162455558777,
+      "learning_rate": 0.0001630316284426864,
+      "loss": 0.073,
+      "step": 950
+    },
+    {
+      "epoch": 0.6162487462387162,
+      "grad_norm": 0.22459714114665985,
+      "learning_rate": 0.00016222117578198477,
+      "loss": 0.0874,
+      "step": 960
+    },
+    {
+      "epoch": 0.6226680040120361,
+      "grad_norm": 0.23248261213302612,
+      "learning_rate": 0.00016140399885152456,
+      "loss": 0.0607,
+      "step": 970
+    },
+    {
+      "epoch": 0.6290872617853561,
+      "grad_norm": 0.28904321789741516,
+      "learning_rate": 0.00016058018596397508,
+      "loss": 0.0708,
+      "step": 980
+    },
+    {
+      "epoch": 0.635506519558676,
+      "grad_norm": 0.18531730771064758,
+      "learning_rate": 0.00015974982614915643,
+      "loss": 0.064,
+      "step": 990
+    },
+    {
+      "epoch": 0.641925777331996,
+      "grad_norm": 0.16869930922985077,
+      "learning_rate": 0.00015891300914441803,
+      "loss": 0.0526,
+      "step": 1000
+    },
+    {
+      "epoch": 0.641925777331996,
+      "eval_loss": 0.08326917141675949,
+      "eval_runtime": 128.9614,
+      "eval_samples_per_second": 10.197,
+      "eval_steps_per_second": 10.197,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6483450351053159,
+      "grad_norm": 0.23396554589271545,
+      "learning_rate": 0.00015806982538494065,
+      "loss": 0.0624,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6547642928786359,
+      "grad_norm": 0.13306337594985962,
+      "learning_rate": 0.00015722036599396296,
+      "loss": 0.084,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6611835506519559,
+      "grad_norm": 0.14258597791194916,
+      "learning_rate": 0.00015636472277293396,
+      "loss": 0.0553,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6676028084252759,
+      "grad_norm": 0.17658811807632446,
+      "learning_rate": 0.00015550298819159189,
+      "loss": 0.0641,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6740220661985958,
+      "grad_norm": 0.22399111092090607,
+      "learning_rate": 0.0001546352553779709,
+      "loss": 0.0798,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6804413239719157,
+      "grad_norm": 0.25611406564712524,
+      "learning_rate": 0.0001537616181083368,
+      "loss": 0.062,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6868605817452357,
+      "grad_norm": 0.21297794580459595,
+      "learning_rate": 0.00015288217079705246,
+      "loss": 0.0557,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6932798395185557,
+      "grad_norm": 0.1601909101009369,
+      "learning_rate": 0.00015199700848637462,
+      "loss": 0.0838,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6996990972918756,
+      "grad_norm": 0.19593413174152374,
+      "learning_rate": 0.00015110622683618243,
+      "loss": 0.0578,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7061183550651956,
+      "grad_norm": 0.3398507237434387,
+      "learning_rate": 0.0001502099221136395,
+      "loss": 0.0738,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7125376128385156,
+      "grad_norm": 0.23959602415561676,
+      "learning_rate": 0.0001493081911827904,
+      "loss": 0.0562,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7189568706118356,
+      "grad_norm": 0.22912147641181946,
+      "learning_rate": 0.00014840113149409233,
+      "loss": 0.0708,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7253761283851554,
+      "grad_norm": 0.1650708019733429,
+      "learning_rate": 0.00014748884107388372,
+      "loss": 0.0552,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7317953861584754,
+      "grad_norm": 0.19332996010780334,
+      "learning_rate": 0.00014657141851379043,
+      "loss": 0.0473,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7382146439317954,
+      "grad_norm": 0.15983298420906067,
+      "learning_rate": 0.00014564896296007088,
+      "loss": 0.0465,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7446339017051153,
+      "grad_norm": 0.16740870475769043,
+      "learning_rate": 0.00014472157410290147,
+      "loss": 0.0482,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7510531594784353,
+      "grad_norm": 0.17517174780368805,
+      "learning_rate": 0.00014378935216560268,
+      "loss": 0.0678,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7574724172517553,
+      "grad_norm": 0.2131085991859436,
+      "learning_rate": 0.00014285239789380824,
+      "loss": 0.0436,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7638916750250753,
+      "grad_norm": 0.17330893874168396,
+      "learning_rate": 0.00014191081254457725,
+      "loss": 0.0504,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7703109327983951,
+      "grad_norm": 0.15023717284202576,
+      "learning_rate": 0.0001409646978754514,
+      "loss": 0.0531,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7767301905717151,
+      "grad_norm": 0.10652194917201996,
+      "learning_rate": 0.00014001415613345793,
+      "loss": 0.0468,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7831494483450351,
+      "grad_norm": 0.11492206156253815,
+      "learning_rate": 0.00013905929004405992,
+      "loss": 0.0497,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7895687061183551,
+      "grad_norm": 0.27618470788002014,
+      "learning_rate": 0.00013810020280005441,
+      "loss": 0.0696,
+      "step": 1230
+    },
+    {
+      "epoch": 0.795987963891675,
+      "grad_norm": 0.10219205170869827,
+      "learning_rate": 0.00013713699805042057,
+      "loss": 0.0514,
+      "step": 1240
+    },
+    {
+      "epoch": 0.802407221664995,
+      "grad_norm": 0.1219983845949173,
+      "learning_rate": 0.00013616977988911821,
+      "loss": 0.0378,
+      "step": 1250
+    },
+    {
+      "epoch": 0.808826479438315,
+      "grad_norm": 0.1977282017469406,
+      "learning_rate": 0.00013519865284383818,
+      "loss": 0.0639,
+      "step": 1260
+    },
+    {
+      "epoch": 0.815245737211635,
+      "grad_norm": 0.11138994246721268,
+      "learning_rate": 0.00013422372186470632,
+      "loss": 0.0575,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8216649949849548,
+      "grad_norm": 0.1591784507036209,
+      "learning_rate": 0.00013324509231294108,
+      "loss": 0.0399,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8280842527582748,
+      "grad_norm": 0.12934090197086334,
+      "learning_rate": 0.00013226286994946746,
+      "loss": 0.0469,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8345035105315948,
+      "grad_norm": 0.13661682605743408,
+      "learning_rate": 0.00013127716092348708,
+      "loss": 0.0441,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8409227683049147,
+      "grad_norm": 0.25683388113975525,
+      "learning_rate": 0.0001302880717610067,
+      "loss": 0.0644,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8473420260782347,
+      "grad_norm": 0.14545147120952606,
+      "learning_rate": 0.00012929570935332597,
+      "loss": 0.0446,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8537612838515547,
+      "grad_norm": 0.16420190036296844,
+      "learning_rate": 0.0001283001809454856,
+      "loss": 0.0597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8601805416248747,
+      "grad_norm": 0.2332782745361328,
+      "learning_rate": 0.00012730159412467726,
+      "loss": 0.0435,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8665997993981945,
+      "grad_norm": 0.09255204349756241,
+      "learning_rate": 0.00012630005680861668,
+      "loss": 0.0528,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8730190571715145,
+      "grad_norm": 0.2076631337404251,
+      "learning_rate": 0.00012529567723388088,
+      "loss": 0.0525,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8794383149448345,
+      "grad_norm": 0.17183755338191986,
+      "learning_rate": 0.0001242885639442111,
+      "loss": 0.0435,
+      "step": 1370
+    },
+    {
+      "epoch": 0.8858575727181545,
+      "grad_norm": 0.1666991412639618,
+      "learning_rate": 0.00012327882577878226,
+      "loss": 0.0403,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8922768304914744,
+      "grad_norm": 0.1412462741136551,
+      "learning_rate": 0.00012226657186044086,
+      "loss": 0.0512,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8986960882647944,
+      "grad_norm": 0.13901075720787048,
+      "learning_rate": 0.00012125191158391178,
+      "loss": 0.0311,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9051153460381144,
+      "grad_norm": 0.1401802897453308,
+      "learning_rate": 0.00012023495460397614,
+      "loss": 0.0383,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9115346038114343,
+      "grad_norm": 0.18335814774036407,
+      "learning_rate": 0.00011921581082362092,
+      "loss": 0.0355,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9179538615847542,
+      "grad_norm": 0.1105201244354248,
+      "learning_rate": 0.00011819459038216143,
+      "loss": 0.0433,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9243731193580742,
+      "grad_norm": 0.10848142206668854,
+      "learning_rate": 0.00011717140364333874,
+      "loss": 0.0333,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9307923771313942,
+      "grad_norm": 0.07852191478013992,
+      "learning_rate": 0.00011614636118339249,
+      "loss": 0.0466,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9372116349047142,
+      "grad_norm": 0.16577693819999695,
+      "learning_rate": 0.00011511957377911091,
+      "loss": 0.0533,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9436308926780341,
+      "grad_norm": 0.12513399124145508,
+      "learning_rate": 0.00011409115239585921,
+      "loss": 0.0371,
+      "step": 1470
+    },
+    {
+      "epoch": 0.9500501504513541,
+      "grad_norm": 0.24522411823272705,
+      "learning_rate": 0.00011306120817558736,
+      "loss": 0.0463,
+      "step": 1480
+    },
+    {
+      "epoch": 0.956469408224674,
+      "grad_norm": 0.1543067842721939,
+      "learning_rate": 0.00011202985242481898,
+      "loss": 0.0411,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9628886659979939,
+      "grad_norm": 0.10811112076044083,
+      "learning_rate": 0.00011099719660262243,
+      "loss": 0.043,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9628886659979939,
+      "eval_loss": 0.057985104620456696,
+      "eval_runtime": 128.7458,
+      "eval_samples_per_second": 10.214,
+      "eval_steps_per_second": 10.214,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9693079237713139,
+      "grad_norm": 0.10716786235570908,
+      "learning_rate": 0.00010996335230856538,
+      "loss": 0.034,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9757271815446339,
+      "grad_norm": 0.12667109072208405,
+      "learning_rate": 0.00010892843127065416,
+      "loss": 0.0382,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9821464393179539,
+      "grad_norm": 0.22383131086826324,
+      "learning_rate": 0.00010789254533325929,
+      "loss": 0.0491,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9885656970912738,
+      "grad_norm": 0.09304796904325485,
+      "learning_rate": 0.00010685580644502837,
+      "loss": 0.0354,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9949849548645938,
+      "grad_norm": 0.07743366062641144,
+      "learning_rate": 0.0001058183266467878,
+      "loss": 0.042,
+      "step": 1550
+    },
+    {
+      "epoch": 1.001283851554664,
+      "grad_norm": 0.15325789153575897,
+      "learning_rate": 0.00010478021805943445,
+      "loss": 0.039,
+      "step": 1560
+    },
+    {
+      "epoch": 1.007703109327984,
+      "grad_norm": 0.07142600417137146,
+      "learning_rate": 0.00010374159287181868,
+      "loss": 0.0299,
+      "step": 1570
+    },
+    {
+      "epoch": 1.014122367101304,
+      "grad_norm": 0.07669597119092941,
+      "learning_rate": 0.00010270256332862014,
+      "loss": 0.0301,
+      "step": 1580
+    },
+    {
+      "epoch": 1.020541624874624,
+      "grad_norm": 0.07297486811876297,
+      "learning_rate": 0.00010166324171821721,
+      "loss": 0.0347,
+      "step": 1590
+    },
+    {
+      "epoch": 1.026960882647944,
+      "grad_norm": 0.20420107245445251,
+      "learning_rate": 0.00010062374036055226,
+      "loss": 0.0307,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0333801404212637,
+      "grad_norm": 0.09947313368320465,
+      "learning_rate": 9.958417159499298e-05,
+      "loss": 0.0307,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0397993981945837,
+      "grad_norm": 0.11809804290533066,
+      "learning_rate": 9.85446477681918e-05,
+      "loss": 0.0299,
+      "step": 1620
+    },
+    {
+      "epoch": 1.0462186559679036,
+      "grad_norm": 0.2631526291370392,
+      "learning_rate": 9.750528122194467e-05,
+      "loss": 0.0387,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0526379137412236,
+      "grad_norm": 0.08276328444480896,
+      "learning_rate": 9.646618428105013e-05,
+      "loss": 0.0305,
+      "step": 1640
+    },
+    {
+      "epoch": 1.0590571715145436,
+      "grad_norm": 0.11126093566417694,
+      "learning_rate": 9.542746924117037e-05,
+      "loss": 0.0315,
+      "step": 1650
+    },
+    {
+      "epoch": 1.0654764292878636,
+      "grad_norm": 0.125727117061615,
+      "learning_rate": 9.438924835669532e-05,
+      "loss": 0.0272,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0718956870611835,
+      "grad_norm": 0.13635793328285217,
+      "learning_rate": 9.33516338286114e-05,
+      "loss": 0.0339,
+      "step": 1670
+    },
+    {
+      "epoch": 1.0783149448345035,
+      "grad_norm": 0.1484992802143097,
+      "learning_rate": 9.231473779237579e-05,
+      "loss": 0.0301,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0847342026078235,
+      "grad_norm": 0.12370124459266663,
+      "learning_rate": 9.127867230579788e-05,
+      "loss": 0.0244,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0911534603811435,
+      "grad_norm": 0.11567872017621994,
+      "learning_rate": 9.024354933692935e-05,
+      "loss": 0.0291,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0975727181544634,
+      "grad_norm": 0.09546232968568802,
+      "learning_rate": 8.920948075196332e-05,
+      "loss": 0.0229,
+      "step": 1710
+    },
+    {
+      "epoch": 1.1039919759277834,
+      "grad_norm": 0.05330915004014969,
+      "learning_rate": 8.817657830314546e-05,
+      "loss": 0.029,
+      "step": 1720
+    },
+    {
+      "epoch": 1.1104112337011034,
+      "grad_norm": 0.07367228716611862,
+      "learning_rate": 8.714495361669644e-05,
+      "loss": 0.0316,
+      "step": 1730
+    },
+    {
+      "epoch": 1.1168304914744234,
+      "grad_norm": 0.056328821927309036,
+      "learning_rate": 8.61147181807486e-05,
+      "loss": 0.0256,
+      "step": 1740
+    },
+    {
+      "epoch": 1.1232497492477433,
+      "grad_norm": 0.11536078155040741,
+      "learning_rate": 8.508598333329744e-05,
+      "loss": 0.0234,
+      "step": 1750
+    },
+    {
+      "epoch": 1.129669007021063,
+      "grad_norm": 0.09325211495161057,
+      "learning_rate": 8.405886025016911e-05,
+      "loss": 0.0316,
+      "step": 1760
+    },
+    {
+      "epoch": 1.136088264794383,
+      "grad_norm": 0.06355856359004974,
+      "learning_rate": 8.303345993300575e-05,
+      "loss": 0.0255,
+      "step": 1770
+    },
+    {
+      "epoch": 1.142507522567703,
+      "grad_norm": 0.11567080020904541,
+      "learning_rate": 8.200989319726937e-05,
+      "loss": 0.0253,
+      "step": 1780
+    },
+    {
+      "epoch": 1.148926780341023,
+      "grad_norm": 0.11177469789981842,
+      "learning_rate": 8.098827066026615e-05,
+      "loss": 0.025,
+      "step": 1790
+    },
+    {
+      "epoch": 1.155346038114343,
+      "grad_norm": 0.12497933954000473,
+      "learning_rate": 7.996870272919165e-05,
+      "loss": 0.0274,
+      "step": 1800
+    },
+    {
+      "epoch": 1.161765295887663,
+      "grad_norm": 0.11924732476472855,
+      "learning_rate": 7.895129958919947e-05,
+      "loss": 0.0321,
+      "step": 1810
+    },
+    {
+      "epoch": 1.168184553660983,
+      "grad_norm": 0.08141748607158661,
+      "learning_rate": 7.793617119149319e-05,
+      "loss": 0.0262,
+      "step": 1820
+    },
+    {
+      "epoch": 1.174603811434303,
+      "grad_norm": 0.11012791097164154,
+      "learning_rate": 7.692342724144397e-05,
+      "loss": 0.0226,
+      "step": 1830
+    },
+    {
+      "epoch": 1.181023069207623,
+      "grad_norm": 0.12142367660999298,
+      "learning_rate": 7.59131771867348e-05,
+      "loss": 0.0302,
+      "step": 1840
+    },
+    {
+      "epoch": 1.1874423269809429,
+      "grad_norm": 0.04989106208086014,
+      "learning_rate": 7.490553020553214e-05,
+      "loss": 0.0297,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1938615847542629,
+      "grad_norm": 0.11036371439695358,
+      "learning_rate": 7.390059519468726e-05,
+      "loss": 0.0287,
+      "step": 1860
+    },
+    {
+      "epoch": 1.2002808425275828,
+      "grad_norm": 0.09587734192609787,
+      "learning_rate": 7.289848075796755e-05,
+      "loss": 0.022,
+      "step": 1870
+    },
+    {
+      "epoch": 1.2067001003009028,
+      "grad_norm": 0.1048700138926506,
+      "learning_rate": 7.189929519431982e-05,
+      "loss": 0.0246,
+      "step": 1880
+    },
+    {
+      "epoch": 1.2131193580742226,
+      "grad_norm": 0.1045086681842804,
+      "learning_rate": 7.090314648616607e-05,
+      "loss": 0.0245,
+      "step": 1890
+    },
+    {
+      "epoch": 1.2195386158475428,
+      "grad_norm": 0.08027637004852295,
+      "learning_rate": 6.991014228773421e-05,
+      "loss": 0.027,
+      "step": 1900
+    },
+    {
+      "epoch": 1.2259578736208625,
+      "grad_norm": 0.09890926629304886,
+      "learning_rate": 6.892038991342349e-05,
+      "loss": 0.0266,
+      "step": 1910
+    },
+    {
+      "epoch": 1.2323771313941825,
+      "grad_norm": 0.06575173139572144,
+      "learning_rate": 6.793399632620715e-05,
+      "loss": 0.0232,
+      "step": 1920
+    },
+    {
+      "epoch": 1.2387963891675025,
+      "grad_norm": 0.07365952432155609,
+      "learning_rate": 6.695106812607282e-05,
+      "loss": 0.0222,
+      "step": 1930
+    },
+    {
+      "epoch": 1.2452156469408224,
+      "grad_norm": 0.1399465799331665,
+      "learning_rate": 6.597171153850219e-05,
+      "loss": 0.0239,
+      "step": 1940
+    },
+    {
+      "epoch": 1.2516349047141424,
+      "grad_norm": 0.07138116657733917,
+      "learning_rate": 6.499603240299133e-05,
+      "loss": 0.0261,
+      "step": 1950
+    },
+    {
+      "epoch": 1.2580541624874624,
+      "grad_norm": 0.07462996989488602,
+      "learning_rate": 6.40241361616123e-05,
+      "loss": 0.0263,
+      "step": 1960
+    },
+    {
+      "epoch": 1.2644734202607824,
+      "grad_norm": 0.06026960164308548,
+      "learning_rate": 6.305612784761823e-05,
+      "loss": 0.0286,
+      "step": 1970
+    },
+    {
+      "epoch": 1.2708926780341023,
+      "grad_norm": 0.04298267886042595,
+      "learning_rate": 6.209211207409225e-05,
+      "loss": 0.0255,
+      "step": 1980
+    },
+    {
+      "epoch": 1.2773119358074223,
+      "grad_norm": 0.053071849048137665,
+      "learning_rate": 6.113219302264174e-05,
+      "loss": 0.0217,
+      "step": 1990
+    },
+    {
+      "epoch": 1.2837311935807423,
+      "grad_norm": 0.05462060496211052,
+      "learning_rate": 6.017647443213974e-05,
+      "loss": 0.0246,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2837311935807423,
+      "eval_loss": 0.046459589153528214,
+      "eval_runtime": 128.8944,
+      "eval_samples_per_second": 10.202,
+      "eval_steps_per_second": 10.202,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2901504513540623,
+      "grad_norm": 0.052726052701473236,
+      "learning_rate": 5.9225059587513454e-05,
+      "loss": 0.0227,
+      "step": 2010
+    },
+    {
+      "epoch": 1.296569709127382,
+      "grad_norm": 0.05038246139883995,
+      "learning_rate": 5.8278051308582505e-05,
+      "loss": 0.0273,
+      "step": 2020
+    },
+    {
+      "epoch": 1.3029889669007022,
+      "grad_norm": 0.09061837941408157,
+      "learning_rate": 5.733555193894695e-05,
+      "loss": 0.0296,
+      "step": 2030
+    },
+    {
+      "epoch": 1.309408224674022,
+      "grad_norm": 0.11903833597898483,
+      "learning_rate": 5.6397663334927096e-05,
+      "loss": 0.0309,
+      "step": 2040
+    },
+    {
+      "epoch": 1.3158274824473422,
+      "grad_norm": 0.05378040671348572,
+      "learning_rate": 5.5464486854555744e-05,
+      "loss": 0.0236,
+      "step": 2050
+    },
+    {
+      "epoch": 1.322246740220662,
+      "grad_norm": 0.049429334700107574,
+      "learning_rate": 5.453612334662446e-05,
+      "loss": 0.0248,
+      "step": 2060
+    },
+    {
+      "epoch": 1.3286659979939819,
+      "grad_norm": 0.0931214764714241,
+      "learning_rate": 5.361267313978472e-05,
+      "loss": 0.0319,
+      "step": 2070
+    },
+    {
+      "epoch": 1.3350852557673019,
+      "grad_norm": 0.0589577853679657,
+      "learning_rate": 5.2694236031705446e-05,
+      "loss": 0.0253,
+      "step": 2080
+    },
+    {
+      "epoch": 1.3415045135406218,
+      "grad_norm": 0.10711564123630524,
+      "learning_rate": 5.178091127828777e-05,
+      "loss": 0.0283,
+      "step": 2090
+    },
+    {
+      "epoch": 1.3479237713139418,
+      "grad_norm": 0.0872858390212059,
+      "learning_rate": 5.087279758293837e-05,
+      "loss": 0.0237,
+      "step": 2100
+    },
+    {
+      "epoch": 1.3543430290872618,
+      "grad_norm": 0.05081092566251755,
+      "learning_rate": 4.996999308590266e-05,
+      "loss": 0.0246,
+      "step": 2110
+    },
+    {
+      "epoch": 1.3607622868605818,
+      "grad_norm": 0.06178516149520874,
+      "learning_rate": 4.907259535365859e-05,
+      "loss": 0.0238,
+      "step": 2120
+    },
+    {
+      "epoch": 1.3671815446339017,
+      "grad_norm": 0.054508257657289505,
+      "learning_rate": 4.818070136837275e-05,
+      "loss": 0.0209,
+      "step": 2130
+    },
+    {
+      "epoch": 1.3736008024072217,
+      "grad_norm": 0.1331976354122162,
+      "learning_rate": 4.72944075174193e-05,
+      "loss": 0.022,
+      "step": 2140
+    },
+    {
+      "epoch": 1.3800200601805417,
+      "grad_norm": 0.06243107095360756,
+      "learning_rate": 4.6413809582963484e-05,
+      "loss": 0.0172,
+      "step": 2150
+    },
+    {
+      "epoch": 1.3864393179538617,
+      "grad_norm": 0.08247397094964981,
+      "learning_rate": 4.553900273161036e-05,
+      "loss": 0.0213,
+      "step": 2160
+    },
+    {
+      "epoch": 1.3928585757271814,
+      "grad_norm": 0.06733040511608124,
+      "learning_rate": 4.467008150412e-05,
+      "loss": 0.0247,
+      "step": 2170
+    },
+    {
+      "epoch": 1.3992778335005016,
+      "grad_norm": 0.05423252657055855,
+      "learning_rate": 4.3807139805190613e-05,
+      "loss": 0.0247,
+      "step": 2180
+    },
+    {
+      "epoch": 1.4056970912738214,
+      "grad_norm": 0.10230846703052521,
+      "learning_rate": 4.295027089331013e-05,
+      "loss": 0.0212,
+      "step": 2190
+    },
+    {
+      "epoch": 1.4121163490471416,
+      "grad_norm": 0.05248994752764702,
+      "learning_rate": 4.2099567370677687e-05,
+      "loss": 0.0177,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4185356068204613,
+      "grad_norm": 0.07035645842552185,
+      "learning_rate": 4.125512117319612e-05,
+      "loss": 0.021,
+      "step": 2210
+    },
+    {
+      "epoch": 1.4249548645937813,
+      "grad_norm": 0.10099633783102036,
+      "learning_rate": 4.041702356053639e-05,
+      "loss": 0.0212,
+      "step": 2220
+    },
+    {
+      "epoch": 1.4313741223671013,
+      "grad_norm": 0.13029153645038605,
+      "learning_rate": 3.958536510627511e-05,
+      "loss": 0.0191,
+      "step": 2230
+    },
+    {
+      "epoch": 1.4377933801404212,
+      "grad_norm": 0.06044310703873634,
+      "learning_rate": 3.876023568810622e-05,
+      "loss": 0.0205,
+      "step": 2240
+    },
+    {
+      "epoch": 1.4442126379137412,
+      "grad_norm": 0.07602677494287491,
+      "learning_rate": 3.794172447812785e-05,
+      "loss": 0.0184,
+      "step": 2250
+    },
+    {
+      "epoch": 1.4506318956870612,
+      "grad_norm": 0.08735393732786179,
+      "learning_rate": 3.7129919933205536e-05,
+      "loss": 0.0251,
+      "step": 2260
+    },
+    {
+      "epoch": 1.4570511534603812,
+      "grad_norm": 0.04306063801050186,
+      "learning_rate": 3.6324909785412445e-05,
+      "loss": 0.0183,
+      "step": 2270
+    },
+    {
+      "epoch": 1.4634704112337011,
+      "grad_norm": 0.05602416768670082,
+      "learning_rate": 3.552678103254838e-05,
+      "loss": 0.0229,
+      "step": 2280
+    },
+    {
+      "epoch": 1.4698896690070211,
+      "grad_norm": 0.09133511781692505,
+      "learning_rate": 3.4735619928737764e-05,
+      "loss": 0.0211,
+      "step": 2290
+    },
+    {
+      "epoch": 1.476308926780341,
+      "grad_norm": 0.10781540721654892,
+      "learning_rate": 3.395151197510804e-05,
+      "loss": 0.0198,
+      "step": 2300
+    },
+    {
+      "epoch": 1.482728184553661,
+      "grad_norm": 0.06286901980638504,
+      "learning_rate": 3.3174541910549784e-05,
+      "loss": 0.0221,
+      "step": 2310
+    },
+    {
+      "epoch": 1.4891474423269808,
+      "grad_norm": 0.046533193439245224,
+      "learning_rate": 3.2404793702558636e-05,
+      "loss": 0.0188,
+      "step": 2320
+    },
+    {
+      "epoch": 1.495566700100301,
+      "grad_norm": 0.034077636897563934,
+      "learning_rate": 3.1642350538161045e-05,
+      "loss": 0.017,
+      "step": 2330
+    },
+    {
+      "epoch": 1.5019859578736208,
+      "grad_norm": 0.053730156272649765,
+      "learning_rate": 3.088729481492424e-05,
+      "loss": 0.0204,
+      "step": 2340
+    },
+    {
+      "epoch": 1.508405215646941,
+      "grad_norm": 0.060676686465740204,
+      "learning_rate": 3.0139708132051424e-05,
+      "loss": 0.0201,
+      "step": 2350
+    },
+    {
+      "epoch": 1.5148244734202607,
+      "grad_norm": 0.09910279512405396,
+      "learning_rate": 2.939967128156328e-05,
+      "loss": 0.0173,
+      "step": 2360
+    },
+    {
+      "epoch": 1.5212437311935807,
+      "grad_norm": 0.0654776319861412,
+      "learning_rate": 2.866726423956687e-05,
+      "loss": 0.0233,
+      "step": 2370
+    },
+    {
+      "epoch": 1.5276629889669007,
+      "grad_norm": 0.08738186955451965,
+      "learning_rate": 2.794256615761247e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 1.5340822467402206,
+      "grad_norm": 0.12039466947317123,
+      "learning_rate": 2.7225655354139677e-05,
+      "loss": 0.0193,
+      "step": 2390
+    },
+    {
+      "epoch": 1.5405015045135406,
+      "grad_norm": 0.12398708611726761,
+      "learning_rate": 2.6516609306013462e-05,
+      "loss": 0.024,
+      "step": 2400
+    },
+    {
+      "epoch": 1.5469207622868606,
+      "grad_norm": 0.09554693102836609,
+      "learning_rate": 2.5815504640151267e-05,
+      "loss": 0.0195,
+      "step": 2410
+    },
+    {
+      "epoch": 1.5533400200601806,
+      "grad_norm": 0.26604852080345154,
+      "learning_rate": 2.512241712524185e-05,
+      "loss": 0.0277,
+      "step": 2420
+    },
+    {
+      "epoch": 1.5597592778335005,
+      "grad_norm": 0.08400601893663406,
+      "learning_rate": 2.443742166355695e-05,
+      "loss": 0.0267,
+      "step": 2430
+    },
+    {
+      "epoch": 1.5661785356068205,
+      "grad_norm": 0.062076788395643234,
+      "learning_rate": 2.3760592282856565e-05,
+      "loss": 0.0209,
+      "step": 2440
+    },
+    {
+      "epoch": 1.5725977933801403,
+      "grad_norm": 0.10021471977233887,
+      "learning_rate": 2.309200212838878e-05,
+      "loss": 0.0225,
+      "step": 2450
+    },
+    {
+      "epoch": 1.5790170511534605,
+      "grad_norm": 0.06898768991231918,
+      "learning_rate": 2.2431723454984778e-05,
+      "loss": 0.0175,
+      "step": 2460
+    },
+    {
+      "epoch": 1.5854363089267802,
+      "grad_norm": 0.06565247476100922,
+      "learning_rate": 2.1779827619250458e-05,
+      "loss": 0.019,
+      "step": 2470
+    },
+    {
+      "epoch": 1.5918555667001004,
+      "grad_norm": 0.10120698809623718,
+      "learning_rate": 2.1136385071854715e-05,
+      "loss": 0.0214,
+      "step": 2480
+    },
+    {
+      "epoch": 1.5982748244734202,
+      "grad_norm": 0.03913561999797821,
+      "learning_rate": 2.050146534991587e-05,
+      "loss": 0.0214,
+      "step": 2490
+    },
+    {
+      "epoch": 1.6046940822467404,
+      "grad_norm": 0.07003747671842575,
+      "learning_rate": 1.987513706948678e-05,
+      "loss": 0.0175,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6046940822467404,
+      "eval_loss": 0.04088287055492401,
+      "eval_runtime": 129.5987,
+      "eval_samples_per_second": 10.147,
+      "eval_steps_per_second": 10.147,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6111133400200601,
+      "grad_norm": 0.06662087887525558,
+      "learning_rate": 1.9257467918139428e-05,
+      "loss": 0.0212,
+      "step": 2510
+    },
+    {
+      "epoch": 1.61753259779338,
+      "grad_norm": 0.10407735407352448,
+      "learning_rate": 1.8648524647649925e-05,
+      "loss": 0.0227,
+      "step": 2520
+    },
+    {
+      "epoch": 1.6239518555667,
+      "grad_norm": 0.048161495476961136,
+      "learning_rate": 1.8048373066784575e-05,
+      "loss": 0.0193,
+      "step": 2530
+    },
+    {
+      "epoch": 1.63037111334002,
+      "grad_norm": 0.04323771968483925,
+      "learning_rate": 1.7457078034188068e-05,
+      "loss": 0.0184,
+      "step": 2540
+    },
+    {
+      "epoch": 1.63679037111334,
+      "grad_norm": 0.030361974611878395,
+      "learning_rate": 1.687470345137383e-05,
+      "loss": 0.0161,
+      "step": 2550
+    },
+    {
+      "epoch": 1.64320962888666,
+      "grad_norm": 0.064698725938797,
+      "learning_rate": 1.63013122558185e-05,
+      "loss": 0.0218,
+      "step": 2560
+    },
+    {
+      "epoch": 1.64962888665998,
+      "grad_norm": 0.04122168570756912,
+      "learning_rate": 1.5736966414160103e-05,
+      "loss": 0.0205,
+      "step": 2570
+    },
+    {
+      "epoch": 1.6560481444333,
+      "grad_norm": 0.0699082463979721,
+      "learning_rate": 1.5181726915501272e-05,
+      "loss": 0.0228,
+      "step": 2580
+    },
+    {
+      "epoch": 1.66246740220662,
+      "grad_norm": 0.08534803241491318,
+      "learning_rate": 1.4635653764818169e-05,
+      "loss": 0.0203,
+      "step": 2590
+    },
+    {
+      "epoch": 1.6688866599799397,
+      "grad_norm": 0.04532073438167572,
+      "learning_rate": 1.4098805976475704e-05,
+      "loss": 0.0219,
+      "step": 2600
+    },
+    {
+      "epoch": 1.6753059177532599,
+      "grad_norm": 0.06504928320646286,
+      "learning_rate": 1.3571241567849856e-05,
+      "loss": 0.0166,
+      "step": 2610
+    },
+    {
+      "epoch": 1.6817251755265796,
+      "grad_norm": 0.1507658064365387,
+      "learning_rate": 1.3053017553057656e-05,
+      "loss": 0.0217,
+      "step": 2620
+    },
+    {
+      "epoch": 1.6881444332998998,
+      "grad_norm": 0.05026421695947647,
+      "learning_rate": 1.2544189936795715e-05,
+      "loss": 0.0174,
+      "step": 2630
+    },
+    {
+      "epoch": 1.6945636910732196,
+      "grad_norm": 0.07838805764913559,
+      "learning_rate": 1.204481370828765e-05,
+      "loss": 0.0188,
+      "step": 2640
+    },
+    {
+      "epoch": 1.7009829488465398,
+      "grad_norm": 0.03739321231842041,
+      "learning_rate": 1.1554942835341565e-05,
+      "loss": 0.0162,
+      "step": 2650
+    },
+    {
+      "epoch": 1.7074022066198595,
+      "grad_norm": 0.07961956411600113,
+      "learning_rate": 1.1074630258517538e-05,
+      "loss": 0.0186,
+      "step": 2660
+    },
+    {
+      "epoch": 1.7138214643931795,
+      "grad_norm": 0.047265585511922836,
+      "learning_rate": 1.0603927885406451e-05,
+      "loss": 0.0211,
+      "step": 2670
+    },
+    {
+      "epoch": 1.7202407221664995,
+      "grad_norm": 0.08342643082141876,
+      "learning_rate": 1.0142886585020218e-05,
+      "loss": 0.0197,
+      "step": 2680
+    },
+    {
+      "epoch": 1.7266599799398195,
+      "grad_norm": 0.0401916429400444,
+      "learning_rate": 9.691556182294392e-06,
+      "loss": 0.0184,
+      "step": 2690
+    },
+    {
+      "epoch": 1.7330792377131394,
+      "grad_norm": 0.10155277699232101,
+      "learning_rate": 9.249985452703557e-06,
+      "loss": 0.0199,
+      "step": 2700
+    },
+    {
+      "epoch": 1.7394984954864594,
+      "grad_norm": 0.048305802047252655,
+      "learning_rate": 8.81822211699016e-06,
+      "loss": 0.0184,
+      "step": 2710
+    },
+    {
+      "epoch": 1.7459177532597794,
+      "grad_norm": 0.055666640400886536,
+      "learning_rate": 8.396312836007259e-06,
+      "loss": 0.018,
+      "step": 2720
+    },
+    {
+      "epoch": 1.7523370110330991,
+      "grad_norm": 0.08541178703308105,
+      "learning_rate": 7.984303205675924e-06,
+      "loss": 0.0233,
+      "step": 2730
+    },
+    {
+      "epoch": 1.7587562688064193,
+      "grad_norm": 0.07454168796539307,
+      "learning_rate": 7.582237752057608e-06,
+      "loss": 0.0207,
+      "step": 2740
+    },
+    {
+      "epoch": 1.765175526579739,
+      "grad_norm": 0.04274304583668709,
+      "learning_rate": 7.19015992654225e-06,
+      "loss": 0.0207,
+      "step": 2750
+    },
+    {
+      "epoch": 1.7715947843530593,
+      "grad_norm": 0.03931812569499016,
+      "learning_rate": 6.808112101152419e-06,
+      "loss": 0.0187,
+      "step": 2760
+    },
+    {
+      "epoch": 1.778014042126379,
+      "grad_norm": 0.04860474541783333,
+      "learning_rate": 6.436135563964196e-06,
+      "loss": 0.0208,
+      "step": 2770
+    },
+    {
+      "epoch": 1.7844332998996992,
+      "grad_norm": 0.05222581326961517,
+      "learning_rate": 6.074270514645109e-06,
+      "loss": 0.0241,
+      "step": 2780
+    },
+    {
+      "epoch": 1.790852557673019,
+      "grad_norm": 0.07665343582630157,
+      "learning_rate": 5.722556060109751e-06,
+      "loss": 0.0232,
+      "step": 2790
+    },
+    {
+      "epoch": 1.7972718154463392,
+      "grad_norm": 0.05370553582906723,
+      "learning_rate": 5.381030210293503e-06,
+      "loss": 0.0158,
+      "step": 2800
+    },
+    {
+      "epoch": 1.803691073219659,
+      "grad_norm": 0.036142729222774506,
+      "learning_rate": 5.049729874044762e-06,
+      "loss": 0.0172,
+      "step": 2810
+    },
+    {
+      "epoch": 1.810110330992979,
+      "grad_norm": 0.03157337009906769,
+      "learning_rate": 4.7286908551361755e-06,
+      "loss": 0.0181,
+      "step": 2820
+    },
+    {
+      "epoch": 1.8165295887662989,
+      "grad_norm": 0.04949560761451721,
+      "learning_rate": 4.417947848395332e-06,
+      "loss": 0.0189,
+      "step": 2830
+    },
+    {
+      "epoch": 1.8229488465396189,
+      "grad_norm": 0.07635015994310379,
+      "learning_rate": 4.117534435955261e-06,
+      "loss": 0.0214,
+      "step": 2840
+    },
+    {
+      "epoch": 1.8293681043129388,
+      "grad_norm": 0.028284436091780663,
+      "learning_rate": 3.827483083625238e-06,
+      "loss": 0.0202,
+      "step": 2850
+    },
+    {
+      "epoch": 1.8357873620862588,
+      "grad_norm": 0.07148082554340363,
+      "learning_rate": 3.5478251373821103e-06,
+      "loss": 0.0189,
+      "step": 2860
+    },
+    {
+      "epoch": 1.8422066198595788,
+      "grad_norm": 0.03399994224309921,
+      "learning_rate": 3.2785908199828073e-06,
+      "loss": 0.0148,
+      "step": 2870
+    },
+    {
+      "epoch": 1.8486258776328985,
+      "grad_norm": 0.05979405716061592,
+      "learning_rate": 3.0198092276981004e-06,
+      "loss": 0.0191,
+      "step": 2880
+    },
+    {
+      "epoch": 1.8550451354062187,
+      "grad_norm": 0.03924340382218361,
+      "learning_rate": 2.7715083271681706e-06,
+      "loss": 0.02,
+      "step": 2890
+    },
+    {
+      "epoch": 1.8614643931795385,
+      "grad_norm": 0.05604667589068413,
+      "learning_rate": 2.5337149523802615e-06,
+      "loss": 0.0198,
+      "step": 2900
+    },
+    {
+      "epoch": 1.8678836509528587,
+      "grad_norm": 0.050481703132390976,
+      "learning_rate": 2.306454801768676e-06,
+      "loss": 0.0183,
+      "step": 2910
+    },
+    {
+      "epoch": 1.8743029087261784,
+      "grad_norm": 0.05771298334002495,
+      "learning_rate": 2.0897524354375753e-06,
+      "loss": 0.0212,
+      "step": 2920
+    },
+    {
+      "epoch": 1.8807221664994986,
+      "grad_norm": 0.05019890144467354,
+      "learning_rate": 1.8836312725067474e-06,
+      "loss": 0.0162,
+      "step": 2930
+    },
+    {
+      "epoch": 1.8871414242728184,
+      "grad_norm": 0.08003148436546326,
+      "learning_rate": 1.6881135885806753e-06,
+      "loss": 0.0251,
+      "step": 2940
+    },
+    {
+      "epoch": 1.8935606820461384,
+      "grad_norm": 0.04569955915212631,
+      "learning_rate": 1.5032205133412192e-06,
+      "loss": 0.0184,
+      "step": 2950
+    },
+    {
+      "epoch": 1.8999799398194583,
+      "grad_norm": 0.02558874897658825,
+      "learning_rate": 1.3289720282641306e-06,
+      "loss": 0.0194,
+      "step": 2960
+    },
+    {
+      "epoch": 1.9063991975927783,
+      "grad_norm": 0.03270651400089264,
+      "learning_rate": 1.1653869644596027e-06,
+      "loss": 0.0178,
+      "step": 2970
+    },
+    {
+      "epoch": 1.9128184553660983,
+      "grad_norm": 0.041952360421419144,
+      "learning_rate": 1.0124830006372432e-06,
+      "loss": 0.0226,
+      "step": 2980
+    },
+    {
+      "epoch": 1.9192377131394183,
+      "grad_norm": 0.02816896326839924,
+      "learning_rate": 8.702766611954793e-07,
+      "loss": 0.0154,
+      "step": 2990
+    },
+    {
+      "epoch": 1.9256569709127382,
+      "grad_norm": 0.04016564413905144,
+      "learning_rate": 7.387833144358092e-07,
+      "loss": 0.0181,
+      "step": 3000
+    },
+    {
+      "epoch": 1.9256569709127382,
+      "eval_loss": 0.03883149474859238,
+      "eval_runtime": 128.8321,
+      "eval_samples_per_second": 10.207,
+      "eval_steps_per_second": 10.207,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3116,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.6478518696874035e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cac3f796d8e8ed78082f710e5c9ee0db63889906ddeefcc55981584c04b6c12
+size 5624

checkpoint-3000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3116/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Coder-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Coder-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoint-3116/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-3116/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8a6ac0c24d4ae2eeeef86c40ba44bcc16cf0bba031ca3b75c17802bafa947cb3
+size 645975704

checkpoint-3116/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-3116/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-3116/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3116/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88efef1f9b42e1291da293392129540a53087aa78784b22383e02cc46618f309
+size 1292176234

checkpoint-3116/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:853400c18a635fcb248ef40716998b342a6d8224df37eb5ad963cdbec4381fe5
+size 14244

checkpoint-3116/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfd4763d7357a39af659f1aa296cd5d56ccf4acbcffe350d544c7adf4bfb60a3
+size 1064

checkpoint-3116/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-3116/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83396048d512ec1f3178af0d7c1f79a226bba041822614b0e26a4fd2d4b55bf7
+size 11421995

checkpoint-3116/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-3116/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2259 @@

+{
+  "best_global_step": 3000,
+  "best_metric": 0.03883149474859238,
+  "best_model_checkpoint": "./checkpoints/checkpoint-3000",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 3116,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0064192577733199595,
+      "grad_norm": 0.734896719455719,
+      "learning_rate": 1.9148936170212766e-05,
+      "loss": 1.2808,
+      "step": 10
+    },
+    {
+      "epoch": 0.012838515546639919,
+      "grad_norm": 0.5790244340896606,
+      "learning_rate": 4.0425531914893614e-05,
+      "loss": 1.065,
+      "step": 20
+    },
+    {
+      "epoch": 0.01925777331995988,
+      "grad_norm": 0.20778240263462067,
+      "learning_rate": 6.170212765957447e-05,
+      "loss": 0.6486,
+      "step": 30
+    },
+    {
+      "epoch": 0.025677031093279838,
+      "grad_norm": 0.17519450187683105,
+      "learning_rate": 8.297872340425533e-05,
+      "loss": 0.559,
+      "step": 40
+    },
+    {
+      "epoch": 0.0320962888665998,
+      "grad_norm": 0.2367718517780304,
+      "learning_rate": 0.00010425531914893618,
+      "loss": 0.4906,
+      "step": 50
+    },
+    {
+      "epoch": 0.03851554663991976,
+      "grad_norm": 0.24615059792995453,
+      "learning_rate": 0.00012553191489361702,
+      "loss": 0.4413,
+      "step": 60
+    },
+    {
+      "epoch": 0.04493480441323972,
+      "grad_norm": 0.22158223390579224,
+      "learning_rate": 0.00014680851063829788,
+      "loss": 0.386,
+      "step": 70
+    },
+    {
+      "epoch": 0.051354062186559676,
+      "grad_norm": 0.18299002945423126,
+      "learning_rate": 0.00016808510638297873,
+      "loss": 0.37,
+      "step": 80
+    },
+    {
+      "epoch": 0.05777331995987964,
+      "grad_norm": 0.483330100774765,
+      "learning_rate": 0.00018936170212765957,
+      "loss": 0.408,
+      "step": 90
+    },
+    {
+      "epoch": 0.0641925777331996,
+      "grad_norm": 0.36728036403656006,
+      "learning_rate": 0.00019999864911039267,
+      "loss": 0.3986,
+      "step": 100
+    },
+    {
+      "epoch": 0.07061183550651956,
+      "grad_norm": 0.23549625277519226,
+      "learning_rate": 0.00019998784221252132,
+      "loss": 0.3367,
+      "step": 110
+    },
+    {
+      "epoch": 0.07703109327983952,
+      "grad_norm": 0.2653771936893463,
+      "learning_rate": 0.0001999662295846848,
+      "loss": 0.3457,
+      "step": 120
+    },
+    {
+      "epoch": 0.08345035105315948,
+      "grad_norm": 0.21738414466381073,
+      "learning_rate": 0.0001999338135625693,
+      "loss": 0.3075,
+      "step": 130
+    },
+    {
+      "epoch": 0.08986960882647944,
+      "grad_norm": 0.2068450152873993,
+      "learning_rate": 0.00019989059764938857,
+      "loss": 0.3423,
+      "step": 140
+    },
+    {
+      "epoch": 0.09628886659979939,
+      "grad_norm": 0.24901063740253448,
+      "learning_rate": 0.00019983658651550522,
+      "loss": 0.2767,
+      "step": 150
+    },
+    {
+      "epoch": 0.10270812437311935,
+      "grad_norm": 0.34664201736450195,
+      "learning_rate": 0.00019977178599792623,
+      "loss": 0.297,
+      "step": 160
+    },
+    {
+      "epoch": 0.10912738214643931,
+      "grad_norm": 0.29314759373664856,
+      "learning_rate": 0.00019969620309967198,
+      "loss": 0.2975,
+      "step": 170
+    },
+    {
+      "epoch": 0.11554663991975928,
+      "grad_norm": 0.3220290541648865,
+      "learning_rate": 0.0001996098459890194,
+      "loss": 0.2717,
+      "step": 180
+    },
+    {
+      "epoch": 0.12196589769307924,
+      "grad_norm": 0.26825714111328125,
+      "learning_rate": 0.00019951272399861938,
+      "loss": 0.262,
+      "step": 190
+    },
+    {
+      "epoch": 0.1283851554663992,
+      "grad_norm": 0.2678840458393097,
+      "learning_rate": 0.00019940484762448794,
+      "loss": 0.2439,
+      "step": 200
+    },
+    {
+      "epoch": 0.13480441323971915,
+      "grad_norm": 0.26287031173706055,
+      "learning_rate": 0.00019928622852487216,
+      "loss": 0.2463,
+      "step": 210
+    },
+    {
+      "epoch": 0.14122367101303912,
+      "grad_norm": 0.2625996172428131,
+      "learning_rate": 0.00019915687951899025,
+      "loss": 0.2158,
+      "step": 220
+    },
+    {
+      "epoch": 0.14764292878635907,
+      "grad_norm": 0.21532030403614044,
+      "learning_rate": 0.00019901681458564592,
+      "loss": 0.2505,
+      "step": 230
+    },
+    {
+      "epoch": 0.15406218655967904,
+      "grad_norm": 0.18523766100406647,
+      "learning_rate": 0.00019886604886171797,
+      "loss": 0.2648,
+      "step": 240
+    },
+    {
+      "epoch": 0.160481444332999,
+      "grad_norm": 0.1750323474407196,
+      "learning_rate": 0.00019870459864052435,
+      "loss": 0.1999,
+      "step": 250
+    },
+    {
+      "epoch": 0.16690070210631897,
+      "grad_norm": 0.21571072936058044,
+      "learning_rate": 0.00019853248137006123,
+      "loss": 0.2513,
+      "step": 260
+    },
+    {
+      "epoch": 0.1733199598796389,
+      "grad_norm": 0.4137316942214966,
+      "learning_rate": 0.00019834971565111758,
+      "loss": 0.2365,
+      "step": 270
+    },
+    {
+      "epoch": 0.1797392176529589,
+      "grad_norm": 0.1867353618144989,
+      "learning_rate": 0.0001981563212352648,
+      "loss": 0.2136,
+      "step": 280
+    },
+    {
+      "epoch": 0.18615847542627884,
+      "grad_norm": 0.24344463646411896,
+      "learning_rate": 0.0001979523190227222,
+      "loss": 0.2051,
+      "step": 290
+    },
+    {
+      "epoch": 0.19257773319959878,
+      "grad_norm": 0.32625189423561096,
+      "learning_rate": 0.0001977377310600984,
+      "loss": 0.2142,
+      "step": 300
+    },
+    {
+      "epoch": 0.19899699097291876,
+      "grad_norm": 0.2645917236804962,
+      "learning_rate": 0.00019751258053800865,
+      "loss": 0.2169,
+      "step": 310
+    },
+    {
+      "epoch": 0.2054162487462387,
+      "grad_norm": 0.2912643849849701,
+      "learning_rate": 0.0001972768917885686,
+      "loss": 0.2276,
+      "step": 320
+    },
+    {
+      "epoch": 0.21183550651955868,
+      "grad_norm": 0.18374884128570557,
+      "learning_rate": 0.00019703069028276482,
+      "loss": 0.181,
+      "step": 330
+    },
+    {
+      "epoch": 0.21825476429287863,
+      "grad_norm": 0.22864000499248505,
+      "learning_rate": 0.000196774002627702,
+      "loss": 0.2036,
+      "step": 340
+    },
+    {
+      "epoch": 0.2246740220661986,
+      "grad_norm": 0.1839989423751831,
+      "learning_rate": 0.00019650685656372763,
+      "loss": 0.1646,
+      "step": 350
+    },
+    {
+      "epoch": 0.23109327983951855,
+      "grad_norm": 0.2861062288284302,
+      "learning_rate": 0.000196229280961434,
+      "loss": 0.1854,
+      "step": 360
+    },
+    {
+      "epoch": 0.23751253761283853,
+      "grad_norm": 0.19540704786777496,
+      "learning_rate": 0.00019594130581853823,
+      "loss": 0.1444,
+      "step": 370
+    },
+    {
+      "epoch": 0.24393179538615847,
+      "grad_norm": 0.33926671743392944,
+      "learning_rate": 0.0001956429622566403,
+      "loss": 0.1933,
+      "step": 380
+    },
+    {
+      "epoch": 0.2503510531594784,
+      "grad_norm": 0.3747280538082123,
+      "learning_rate": 0.00019533428251785983,
+      "loss": 0.1932,
+      "step": 390
+    },
+    {
+      "epoch": 0.2567703109327984,
+      "grad_norm": 0.1787070333957672,
+      "learning_rate": 0.00019501529996135156,
+      "loss": 0.1855,
+      "step": 400
+    },
+    {
+      "epoch": 0.26318956870611837,
+      "grad_norm": 0.21175768971443176,
+      "learning_rate": 0.00019468604905970033,
+      "loss": 0.2053,
+      "step": 410
+    },
+    {
+      "epoch": 0.2696088264794383,
+      "grad_norm": 0.22584153711795807,
+      "learning_rate": 0.00019434656539519548,
+      "loss": 0.1551,
+      "step": 420
+    },
+    {
+      "epoch": 0.27602808425275827,
+      "grad_norm": 0.20716825127601624,
+      "learning_rate": 0.00019399688565598547,
+      "loss": 0.1708,
+      "step": 430
+    },
+    {
+      "epoch": 0.28244734202607824,
+      "grad_norm": 0.21005718410015106,
+      "learning_rate": 0.0001936370476321132,
+      "loss": 0.1279,
+      "step": 440
+    },
+    {
+      "epoch": 0.2888665997993982,
+      "grad_norm": 0.209135502576828,
+      "learning_rate": 0.00019326709021143167,
+      "loss": 0.1584,
+      "step": 450
+    },
+    {
+      "epoch": 0.29528585757271814,
+      "grad_norm": 0.2232855260372162,
+      "learning_rate": 0.00019288705337540166,
+      "loss": 0.1562,
+      "step": 460
+    },
+    {
+      "epoch": 0.3017051153460381,
+      "grad_norm": 0.2212464064359665,
+      "learning_rate": 0.0001924969781947707,
+      "loss": 0.1733,
+      "step": 470
+    },
+    {
+      "epoch": 0.3081243731193581,
+      "grad_norm": 0.18040229380130768,
+      "learning_rate": 0.00019209690682513465,
+      "loss": 0.12,
+      "step": 480
+    },
+    {
+      "epoch": 0.31454363089267806,
+      "grad_norm": 0.16415567696094513,
+      "learning_rate": 0.0001916868825023819,
+      "loss": 0.1257,
+      "step": 490
+    },
+    {
+      "epoch": 0.320962888665998,
+      "grad_norm": 0.18324217200279236,
+      "learning_rate": 0.00019126694953802093,
+      "loss": 0.138,
+      "step": 500
+    },
+    {
+      "epoch": 0.320962888665998,
+      "eval_loss": 0.15367139875888824,
+      "eval_runtime": 127.7222,
+      "eval_samples_per_second": 10.296,
+      "eval_steps_per_second": 10.296,
+      "step": 500
+    },
+    {
+      "epoch": 0.32738214643931796,
+      "grad_norm": 0.15339615941047668,
+      "learning_rate": 0.00019083715331439134,
+      "loss": 0.1653,
+      "step": 510
+    },
+    {
+      "epoch": 0.33380140421263793,
+      "grad_norm": 0.2750616669654846,
+      "learning_rate": 0.00019039754027975952,
+      "loss": 0.1834,
+      "step": 520
+    },
+    {
+      "epoch": 0.34022066198595785,
+      "grad_norm": 0.20012134313583374,
+      "learning_rate": 0.00018994815794329896,
+      "loss": 0.1278,
+      "step": 530
+    },
+    {
+      "epoch": 0.3466399197592778,
+      "grad_norm": 0.24493156373500824,
+      "learning_rate": 0.0001894890548699559,
+      "loss": 0.142,
+      "step": 540
+    },
+    {
+      "epoch": 0.3530591775325978,
+      "grad_norm": 0.1628945767879486,
+      "learning_rate": 0.0001890202806752008,
+      "loss": 0.1287,
+      "step": 550
+    },
+    {
+      "epoch": 0.3594784353059178,
+      "grad_norm": 0.24853569269180298,
+      "learning_rate": 0.00018854188601966657,
+      "loss": 0.1397,
+      "step": 560
+    },
+    {
+      "epoch": 0.3658976930792377,
+      "grad_norm": 0.1540278196334839,
+      "learning_rate": 0.0001880539226036734,
+      "loss": 0.1343,
+      "step": 570
+    },
+    {
+      "epoch": 0.37231695085255767,
+      "grad_norm": 0.3391481935977936,
+      "learning_rate": 0.00018755644316164176,
+      "loss": 0.122,
+      "step": 580
+    },
+    {
+      "epoch": 0.37873620862587765,
+      "grad_norm": 0.19285668432712555,
+      "learning_rate": 0.0001870495014563931,
+      "loss": 0.1015,
+      "step": 590
+    },
+    {
+      "epoch": 0.38515546639919757,
+      "grad_norm": 0.19064725935459137,
+      "learning_rate": 0.00018653315227333992,
+      "loss": 0.1165,
+      "step": 600
+    },
+    {
+      "epoch": 0.39157472417251754,
+      "grad_norm": 0.21822167932987213,
+      "learning_rate": 0.00018600745141456485,
+      "loss": 0.1185,
+      "step": 610
+    },
+    {
+      "epoch": 0.3979939819458375,
+      "grad_norm": 0.21495632827281952,
+      "learning_rate": 0.0001854724556927903,
+      "loss": 0.1455,
+      "step": 620
+    },
+    {
+      "epoch": 0.4044132397191575,
+      "grad_norm": 0.21818260848522186,
+      "learning_rate": 0.00018492822292523863,
+      "loss": 0.1045,
+      "step": 630
+    },
+    {
+      "epoch": 0.4108324974924774,
+      "grad_norm": 0.2295123040676117,
+      "learning_rate": 0.0001843748119273837,
+      "loss": 0.1331,
+      "step": 640
+    },
+    {
+      "epoch": 0.4172517552657974,
+      "grad_norm": 0.3051343560218811,
+      "learning_rate": 0.0001838122825065948,
+      "loss": 0.1139,
+      "step": 650
+    },
+    {
+      "epoch": 0.42367101303911736,
+      "grad_norm": 0.24361808598041534,
+      "learning_rate": 0.0001832406954556732,
+      "loss": 0.1205,
+      "step": 660
+    },
+    {
+      "epoch": 0.43009027081243734,
+      "grad_norm": 0.19327011704444885,
+      "learning_rate": 0.00018266011254628218,
+      "loss": 0.087,
+      "step": 670
+    },
+    {
+      "epoch": 0.43650952858575726,
+      "grad_norm": 0.21391886472702026,
+      "learning_rate": 0.0001820705965222714,
+      "loss": 0.1079,
+      "step": 680
+    },
+    {
+      "epoch": 0.44292878635907723,
+      "grad_norm": 0.21966692805290222,
+      "learning_rate": 0.0001814722110928962,
+      "loss": 0.1188,
+      "step": 690
+    },
+    {
+      "epoch": 0.4493480441323972,
+      "grad_norm": 0.19202131032943726,
+      "learning_rate": 0.00018086502092593234,
+      "loss": 0.0991,
+      "step": 700
+    },
+    {
+      "epoch": 0.4557673019057171,
+      "grad_norm": 0.1604185700416565,
+      "learning_rate": 0.00018024909164068755,
+      "loss": 0.094,
+      "step": 710
+    },
+    {
+      "epoch": 0.4621865596790371,
+      "grad_norm": 0.21912045776844025,
+      "learning_rate": 0.00017962448980090982,
+      "loss": 0.1093,
+      "step": 720
+    },
+    {
+      "epoch": 0.4686058174523571,
+      "grad_norm": 0.16105616092681885,
+      "learning_rate": 0.00017899128290759395,
+      "loss": 0.0747,
+      "step": 730
+    },
+    {
+      "epoch": 0.47502507522567705,
+      "grad_norm": 0.19927042722702026,
+      "learning_rate": 0.00017834953939168663,
+      "loss": 0.1057,
+      "step": 740
+    },
+    {
+      "epoch": 0.48144433299899697,
+      "grad_norm": 0.2838548421859741,
+      "learning_rate": 0.00017769932860669111,
+      "loss": 0.0891,
+      "step": 750
+    },
+    {
+      "epoch": 0.48786359077231695,
+      "grad_norm": 0.16990593075752258,
+      "learning_rate": 0.00017704072082117215,
+      "loss": 0.0993,
+      "step": 760
+    },
+    {
+      "epoch": 0.4942828485456369,
+      "grad_norm": 0.22423957288265228,
+      "learning_rate": 0.00017637378721116197,
+      "loss": 0.0787,
+      "step": 770
+    },
+    {
+      "epoch": 0.5007021063189568,
+      "grad_norm": 0.224956676363945,
+      "learning_rate": 0.00017569859985246838,
+      "loss": 0.1019,
+      "step": 780
+    },
+    {
+      "epoch": 0.5071213640922768,
+      "grad_norm": 0.11727194488048553,
+      "learning_rate": 0.0001750152317128854,
+      "loss": 0.0806,
+      "step": 790
+    },
+    {
+      "epoch": 0.5135406218655968,
+      "grad_norm": 0.13485625386238098,
+      "learning_rate": 0.00017432375664430753,
+      "loss": 0.0873,
+      "step": 800
+    },
+    {
+      "epoch": 0.5199598796389168,
+      "grad_norm": 0.2887745201587677,
+      "learning_rate": 0.0001736242493747488,
+      "loss": 0.0796,
+      "step": 810
+    },
+    {
+      "epoch": 0.5263791374122367,
+      "grad_norm": 0.3110921084880829,
+      "learning_rate": 0.00017291678550026665,
+      "loss": 0.0912,
+      "step": 820
+    },
+    {
+      "epoch": 0.5327983951855567,
+      "grad_norm": 0.17630113661289215,
+      "learning_rate": 0.0001722014414767923,
+      "loss": 0.0795,
+      "step": 830
+    },
+    {
+      "epoch": 0.5392176529588766,
+      "grad_norm": 0.1666630357503891,
+      "learning_rate": 0.00017147829461186816,
+      "loss": 0.0791,
+      "step": 840
+    },
+    {
+      "epoch": 0.5456369107321966,
+      "grad_norm": 0.17798000574111938,
+      "learning_rate": 0.00017074742305629305,
+      "loss": 0.1046,
+      "step": 850
+    },
+    {
+      "epoch": 0.5520561685055165,
+      "grad_norm": 0.30929967761039734,
+      "learning_rate": 0.0001700089057956766,
+      "loss": 0.088,
+      "step": 860
+    },
+    {
+      "epoch": 0.5584754262788365,
+      "grad_norm": 0.12197452783584595,
+      "learning_rate": 0.00016926282264190313,
+      "loss": 0.0766,
+      "step": 870
+    },
+    {
+      "epoch": 0.5648946840521565,
+      "grad_norm": 0.254573255777359,
+      "learning_rate": 0.00016850925422450626,
+      "loss": 0.1153,
+      "step": 880
+    },
+    {
+      "epoch": 0.5713139418254765,
+      "grad_norm": 0.14357267320156097,
+      "learning_rate": 0.0001677482819819554,
+      "loss": 0.0919,
+      "step": 890
+    },
+    {
+      "epoch": 0.5777331995987964,
+      "grad_norm": 0.22417870163917542,
+      "learning_rate": 0.00016697998815285461,
+      "loss": 0.0771,
+      "step": 900
+    },
+    {
+      "epoch": 0.5841524573721163,
+      "grad_norm": 0.37406066060066223,
+      "learning_rate": 0.00016620445576705502,
+      "loss": 0.0783,
+      "step": 910
+    },
+    {
+      "epoch": 0.5905717151454363,
+      "grad_norm": 0.1301262080669403,
+      "learning_rate": 0.0001654217686366817,
+      "loss": 0.064,
+      "step": 920
+    },
+    {
+      "epoch": 0.5969909729187562,
+      "grad_norm": 0.13001087307929993,
+      "learning_rate": 0.0001646320113470761,
+      "loss": 0.0554,
+      "step": 930
+    },
+    {
+      "epoch": 0.6034102306920762,
+      "grad_norm": 0.1982804834842682,
+      "learning_rate": 0.00016383526924765494,
+      "loss": 0.0902,
+      "step": 940
+    },
+    {
+      "epoch": 0.6098294884653962,
+      "grad_norm": 0.14553162455558777,
+      "learning_rate": 0.0001630316284426864,
+      "loss": 0.073,
+      "step": 950
+    },
+    {
+      "epoch": 0.6162487462387162,
+      "grad_norm": 0.22459714114665985,
+      "learning_rate": 0.00016222117578198477,
+      "loss": 0.0874,
+      "step": 960
+    },
+    {
+      "epoch": 0.6226680040120361,
+      "grad_norm": 0.23248261213302612,
+      "learning_rate": 0.00016140399885152456,
+      "loss": 0.0607,
+      "step": 970
+    },
+    {
+      "epoch": 0.6290872617853561,
+      "grad_norm": 0.28904321789741516,
+      "learning_rate": 0.00016058018596397508,
+      "loss": 0.0708,
+      "step": 980
+    },
+    {
+      "epoch": 0.635506519558676,
+      "grad_norm": 0.18531730771064758,
+      "learning_rate": 0.00015974982614915643,
+      "loss": 0.064,
+      "step": 990
+    },
+    {
+      "epoch": 0.641925777331996,
+      "grad_norm": 0.16869930922985077,
+      "learning_rate": 0.00015891300914441803,
+      "loss": 0.0526,
+      "step": 1000
+    },
+    {
+      "epoch": 0.641925777331996,
+      "eval_loss": 0.08326917141675949,
+      "eval_runtime": 128.9614,
+      "eval_samples_per_second": 10.197,
+      "eval_steps_per_second": 10.197,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6483450351053159,
+      "grad_norm": 0.23396554589271545,
+      "learning_rate": 0.00015806982538494065,
+      "loss": 0.0624,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6547642928786359,
+      "grad_norm": 0.13306337594985962,
+      "learning_rate": 0.00015722036599396296,
+      "loss": 0.084,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6611835506519559,
+      "grad_norm": 0.14258597791194916,
+      "learning_rate": 0.00015636472277293396,
+      "loss": 0.0553,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6676028084252759,
+      "grad_norm": 0.17658811807632446,
+      "learning_rate": 0.00015550298819159189,
+      "loss": 0.0641,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6740220661985958,
+      "grad_norm": 0.22399111092090607,
+      "learning_rate": 0.0001546352553779709,
+      "loss": 0.0798,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6804413239719157,
+      "grad_norm": 0.25611406564712524,
+      "learning_rate": 0.0001537616181083368,
+      "loss": 0.062,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6868605817452357,
+      "grad_norm": 0.21297794580459595,
+      "learning_rate": 0.00015288217079705246,
+      "loss": 0.0557,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6932798395185557,
+      "grad_norm": 0.1601909101009369,
+      "learning_rate": 0.00015199700848637462,
+      "loss": 0.0838,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6996990972918756,
+      "grad_norm": 0.19593413174152374,
+      "learning_rate": 0.00015110622683618243,
+      "loss": 0.0578,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7061183550651956,
+      "grad_norm": 0.3398507237434387,
+      "learning_rate": 0.0001502099221136395,
+      "loss": 0.0738,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7125376128385156,
+      "grad_norm": 0.23959602415561676,
+      "learning_rate": 0.0001493081911827904,
+      "loss": 0.0562,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7189568706118356,
+      "grad_norm": 0.22912147641181946,
+      "learning_rate": 0.00014840113149409233,
+      "loss": 0.0708,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7253761283851554,
+      "grad_norm": 0.1650708019733429,
+      "learning_rate": 0.00014748884107388372,
+      "loss": 0.0552,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7317953861584754,
+      "grad_norm": 0.19332996010780334,
+      "learning_rate": 0.00014657141851379043,
+      "loss": 0.0473,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7382146439317954,
+      "grad_norm": 0.15983298420906067,
+      "learning_rate": 0.00014564896296007088,
+      "loss": 0.0465,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7446339017051153,
+      "grad_norm": 0.16740870475769043,
+      "learning_rate": 0.00014472157410290147,
+      "loss": 0.0482,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7510531594784353,
+      "grad_norm": 0.17517174780368805,
+      "learning_rate": 0.00014378935216560268,
+      "loss": 0.0678,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7574724172517553,
+      "grad_norm": 0.2131085991859436,
+      "learning_rate": 0.00014285239789380824,
+      "loss": 0.0436,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7638916750250753,
+      "grad_norm": 0.17330893874168396,
+      "learning_rate": 0.00014191081254457725,
+      "loss": 0.0504,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7703109327983951,
+      "grad_norm": 0.15023717284202576,
+      "learning_rate": 0.0001409646978754514,
+      "loss": 0.0531,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7767301905717151,
+      "grad_norm": 0.10652194917201996,
+      "learning_rate": 0.00014001415613345793,
+      "loss": 0.0468,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7831494483450351,
+      "grad_norm": 0.11492206156253815,
+      "learning_rate": 0.00013905929004405992,
+      "loss": 0.0497,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7895687061183551,
+      "grad_norm": 0.27618470788002014,
+      "learning_rate": 0.00013810020280005441,
+      "loss": 0.0696,
+      "step": 1230
+    },
+    {
+      "epoch": 0.795987963891675,
+      "grad_norm": 0.10219205170869827,
+      "learning_rate": 0.00013713699805042057,
+      "loss": 0.0514,
+      "step": 1240
+    },
+    {
+      "epoch": 0.802407221664995,
+      "grad_norm": 0.1219983845949173,
+      "learning_rate": 0.00013616977988911821,
+      "loss": 0.0378,
+      "step": 1250
+    },
+    {
+      "epoch": 0.808826479438315,
+      "grad_norm": 0.1977282017469406,
+      "learning_rate": 0.00013519865284383818,
+      "loss": 0.0639,
+      "step": 1260
+    },
+    {
+      "epoch": 0.815245737211635,
+      "grad_norm": 0.11138994246721268,
+      "learning_rate": 0.00013422372186470632,
+      "loss": 0.0575,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8216649949849548,
+      "grad_norm": 0.1591784507036209,
+      "learning_rate": 0.00013324509231294108,
+      "loss": 0.0399,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8280842527582748,
+      "grad_norm": 0.12934090197086334,
+      "learning_rate": 0.00013226286994946746,
+      "loss": 0.0469,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8345035105315948,
+      "grad_norm": 0.13661682605743408,
+      "learning_rate": 0.00013127716092348708,
+      "loss": 0.0441,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8409227683049147,
+      "grad_norm": 0.25683388113975525,
+      "learning_rate": 0.0001302880717610067,
+      "loss": 0.0644,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8473420260782347,
+      "grad_norm": 0.14545147120952606,
+      "learning_rate": 0.00012929570935332597,
+      "loss": 0.0446,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8537612838515547,
+      "grad_norm": 0.16420190036296844,
+      "learning_rate": 0.0001283001809454856,
+      "loss": 0.0597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8601805416248747,
+      "grad_norm": 0.2332782745361328,
+      "learning_rate": 0.00012730159412467726,
+      "loss": 0.0435,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8665997993981945,
+      "grad_norm": 0.09255204349756241,
+      "learning_rate": 0.00012630005680861668,
+      "loss": 0.0528,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8730190571715145,
+      "grad_norm": 0.2076631337404251,
+      "learning_rate": 0.00012529567723388088,
+      "loss": 0.0525,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8794383149448345,
+      "grad_norm": 0.17183755338191986,
+      "learning_rate": 0.0001242885639442111,
+      "loss": 0.0435,
+      "step": 1370
+    },
+    {
+      "epoch": 0.8858575727181545,
+      "grad_norm": 0.1666991412639618,
+      "learning_rate": 0.00012327882577878226,
+      "loss": 0.0403,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8922768304914744,
+      "grad_norm": 0.1412462741136551,
+      "learning_rate": 0.00012226657186044086,
+      "loss": 0.0512,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8986960882647944,
+      "grad_norm": 0.13901075720787048,
+      "learning_rate": 0.00012125191158391178,
+      "loss": 0.0311,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9051153460381144,
+      "grad_norm": 0.1401802897453308,
+      "learning_rate": 0.00012023495460397614,
+      "loss": 0.0383,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9115346038114343,
+      "grad_norm": 0.18335814774036407,
+      "learning_rate": 0.00011921581082362092,
+      "loss": 0.0355,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9179538615847542,
+      "grad_norm": 0.1105201244354248,
+      "learning_rate": 0.00011819459038216143,
+      "loss": 0.0433,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9243731193580742,
+      "grad_norm": 0.10848142206668854,
+      "learning_rate": 0.00011717140364333874,
+      "loss": 0.0333,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9307923771313942,
+      "grad_norm": 0.07852191478013992,
+      "learning_rate": 0.00011614636118339249,
+      "loss": 0.0466,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9372116349047142,
+      "grad_norm": 0.16577693819999695,
+      "learning_rate": 0.00011511957377911091,
+      "loss": 0.0533,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9436308926780341,
+      "grad_norm": 0.12513399124145508,
+      "learning_rate": 0.00011409115239585921,
+      "loss": 0.0371,
+      "step": 1470
+    },
+    {
+      "epoch": 0.9500501504513541,
+      "grad_norm": 0.24522411823272705,
+      "learning_rate": 0.00011306120817558736,
+      "loss": 0.0463,
+      "step": 1480
+    },
+    {
+      "epoch": 0.956469408224674,
+      "grad_norm": 0.1543067842721939,
+      "learning_rate": 0.00011202985242481898,
+      "loss": 0.0411,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9628886659979939,
+      "grad_norm": 0.10811112076044083,
+      "learning_rate": 0.00011099719660262243,
+      "loss": 0.043,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9628886659979939,
+      "eval_loss": 0.057985104620456696,
+      "eval_runtime": 128.7458,
+      "eval_samples_per_second": 10.214,
+      "eval_steps_per_second": 10.214,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9693079237713139,
+      "grad_norm": 0.10716786235570908,
+      "learning_rate": 0.00010996335230856538,
+      "loss": 0.034,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9757271815446339,
+      "grad_norm": 0.12667109072208405,
+      "learning_rate": 0.00010892843127065416,
+      "loss": 0.0382,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9821464393179539,
+      "grad_norm": 0.22383131086826324,
+      "learning_rate": 0.00010789254533325929,
+      "loss": 0.0491,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9885656970912738,
+      "grad_norm": 0.09304796904325485,
+      "learning_rate": 0.00010685580644502837,
+      "loss": 0.0354,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9949849548645938,
+      "grad_norm": 0.07743366062641144,
+      "learning_rate": 0.0001058183266467878,
+      "loss": 0.042,
+      "step": 1550
+    },
+    {
+      "epoch": 1.001283851554664,
+      "grad_norm": 0.15325789153575897,
+      "learning_rate": 0.00010478021805943445,
+      "loss": 0.039,
+      "step": 1560
+    },
+    {
+      "epoch": 1.007703109327984,
+      "grad_norm": 0.07142600417137146,
+      "learning_rate": 0.00010374159287181868,
+      "loss": 0.0299,
+      "step": 1570
+    },
+    {
+      "epoch": 1.014122367101304,
+      "grad_norm": 0.07669597119092941,
+      "learning_rate": 0.00010270256332862014,
+      "loss": 0.0301,
+      "step": 1580
+    },
+    {
+      "epoch": 1.020541624874624,
+      "grad_norm": 0.07297486811876297,
+      "learning_rate": 0.00010166324171821721,
+      "loss": 0.0347,
+      "step": 1590
+    },
+    {
+      "epoch": 1.026960882647944,
+      "grad_norm": 0.20420107245445251,
+      "learning_rate": 0.00010062374036055226,
+      "loss": 0.0307,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0333801404212637,
+      "grad_norm": 0.09947313368320465,
+      "learning_rate": 9.958417159499298e-05,
+      "loss": 0.0307,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0397993981945837,
+      "grad_norm": 0.11809804290533066,
+      "learning_rate": 9.85446477681918e-05,
+      "loss": 0.0299,
+      "step": 1620
+    },
+    {
+      "epoch": 1.0462186559679036,
+      "grad_norm": 0.2631526291370392,
+      "learning_rate": 9.750528122194467e-05,
+      "loss": 0.0387,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0526379137412236,
+      "grad_norm": 0.08276328444480896,
+      "learning_rate": 9.646618428105013e-05,
+      "loss": 0.0305,
+      "step": 1640
+    },
+    {
+      "epoch": 1.0590571715145436,
+      "grad_norm": 0.11126093566417694,
+      "learning_rate": 9.542746924117037e-05,
+      "loss": 0.0315,
+      "step": 1650
+    },
+    {
+      "epoch": 1.0654764292878636,
+      "grad_norm": 0.125727117061615,
+      "learning_rate": 9.438924835669532e-05,
+      "loss": 0.0272,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0718956870611835,
+      "grad_norm": 0.13635793328285217,
+      "learning_rate": 9.33516338286114e-05,
+      "loss": 0.0339,
+      "step": 1670
+    },
+    {
+      "epoch": 1.0783149448345035,
+      "grad_norm": 0.1484992802143097,
+      "learning_rate": 9.231473779237579e-05,
+      "loss": 0.0301,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0847342026078235,
+      "grad_norm": 0.12370124459266663,
+      "learning_rate": 9.127867230579788e-05,
+      "loss": 0.0244,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0911534603811435,
+      "grad_norm": 0.11567872017621994,
+      "learning_rate": 9.024354933692935e-05,
+      "loss": 0.0291,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0975727181544634,
+      "grad_norm": 0.09546232968568802,
+      "learning_rate": 8.920948075196332e-05,
+      "loss": 0.0229,
+      "step": 1710
+    },
+    {
+      "epoch": 1.1039919759277834,
+      "grad_norm": 0.05330915004014969,
+      "learning_rate": 8.817657830314546e-05,
+      "loss": 0.029,
+      "step": 1720
+    },
+    {
+      "epoch": 1.1104112337011034,
+      "grad_norm": 0.07367228716611862,
+      "learning_rate": 8.714495361669644e-05,
+      "loss": 0.0316,
+      "step": 1730
+    },
+    {
+      "epoch": 1.1168304914744234,
+      "grad_norm": 0.056328821927309036,
+      "learning_rate": 8.61147181807486e-05,
+      "loss": 0.0256,
+      "step": 1740
+    },
+    {
+      "epoch": 1.1232497492477433,
+      "grad_norm": 0.11536078155040741,
+      "learning_rate": 8.508598333329744e-05,
+      "loss": 0.0234,
+      "step": 1750
+    },
+    {
+      "epoch": 1.129669007021063,
+      "grad_norm": 0.09325211495161057,
+      "learning_rate": 8.405886025016911e-05,
+      "loss": 0.0316,
+      "step": 1760
+    },
+    {
+      "epoch": 1.136088264794383,
+      "grad_norm": 0.06355856359004974,
+      "learning_rate": 8.303345993300575e-05,
+      "loss": 0.0255,
+      "step": 1770
+    },
+    {
+      "epoch": 1.142507522567703,
+      "grad_norm": 0.11567080020904541,
+      "learning_rate": 8.200989319726937e-05,
+      "loss": 0.0253,
+      "step": 1780
+    },
+    {
+      "epoch": 1.148926780341023,
+      "grad_norm": 0.11177469789981842,
+      "learning_rate": 8.098827066026615e-05,
+      "loss": 0.025,
+      "step": 1790
+    },
+    {
+      "epoch": 1.155346038114343,
+      "grad_norm": 0.12497933954000473,
+      "learning_rate": 7.996870272919165e-05,
+      "loss": 0.0274,
+      "step": 1800
+    },
+    {
+      "epoch": 1.161765295887663,
+      "grad_norm": 0.11924732476472855,
+      "learning_rate": 7.895129958919947e-05,
+      "loss": 0.0321,
+      "step": 1810
+    },
+    {
+      "epoch": 1.168184553660983,
+      "grad_norm": 0.08141748607158661,
+      "learning_rate": 7.793617119149319e-05,
+      "loss": 0.0262,
+      "step": 1820
+    },
+    {
+      "epoch": 1.174603811434303,
+      "grad_norm": 0.11012791097164154,
+      "learning_rate": 7.692342724144397e-05,
+      "loss": 0.0226,
+      "step": 1830
+    },
+    {
+      "epoch": 1.181023069207623,
+      "grad_norm": 0.12142367660999298,
+      "learning_rate": 7.59131771867348e-05,
+      "loss": 0.0302,
+      "step": 1840
+    },
+    {
+      "epoch": 1.1874423269809429,
+      "grad_norm": 0.04989106208086014,
+      "learning_rate": 7.490553020553214e-05,
+      "loss": 0.0297,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1938615847542629,
+      "grad_norm": 0.11036371439695358,
+      "learning_rate": 7.390059519468726e-05,
+      "loss": 0.0287,
+      "step": 1860
+    },
+    {
+      "epoch": 1.2002808425275828,
+      "grad_norm": 0.09587734192609787,
+      "learning_rate": 7.289848075796755e-05,
+      "loss": 0.022,
+      "step": 1870
+    },
+    {
+      "epoch": 1.2067001003009028,
+      "grad_norm": 0.1048700138926506,
+      "learning_rate": 7.189929519431982e-05,
+      "loss": 0.0246,
+      "step": 1880
+    },
+    {
+      "epoch": 1.2131193580742226,
+      "grad_norm": 0.1045086681842804,
+      "learning_rate": 7.090314648616607e-05,
+      "loss": 0.0245,
+      "step": 1890
+    },
+    {
+      "epoch": 1.2195386158475428,
+      "grad_norm": 0.08027637004852295,
+      "learning_rate": 6.991014228773421e-05,
+      "loss": 0.027,
+      "step": 1900
+    },
+    {
+      "epoch": 1.2259578736208625,
+      "grad_norm": 0.09890926629304886,
+      "learning_rate": 6.892038991342349e-05,
+      "loss": 0.0266,
+      "step": 1910
+    },
+    {
+      "epoch": 1.2323771313941825,
+      "grad_norm": 0.06575173139572144,
+      "learning_rate": 6.793399632620715e-05,
+      "loss": 0.0232,
+      "step": 1920
+    },
+    {
+      "epoch": 1.2387963891675025,
+      "grad_norm": 0.07365952432155609,
+      "learning_rate": 6.695106812607282e-05,
+      "loss": 0.0222,
+      "step": 1930
+    },
+    {
+      "epoch": 1.2452156469408224,
+      "grad_norm": 0.1399465799331665,
+      "learning_rate": 6.597171153850219e-05,
+      "loss": 0.0239,
+      "step": 1940
+    },
+    {
+      "epoch": 1.2516349047141424,
+      "grad_norm": 0.07138116657733917,
+      "learning_rate": 6.499603240299133e-05,
+      "loss": 0.0261,
+      "step": 1950
+    },
+    {
+      "epoch": 1.2580541624874624,
+      "grad_norm": 0.07462996989488602,
+      "learning_rate": 6.40241361616123e-05,
+      "loss": 0.0263,
+      "step": 1960
+    },
+    {
+      "epoch": 1.2644734202607824,
+      "grad_norm": 0.06026960164308548,
+      "learning_rate": 6.305612784761823e-05,
+      "loss": 0.0286,
+      "step": 1970
+    },
+    {
+      "epoch": 1.2708926780341023,
+      "grad_norm": 0.04298267886042595,
+      "learning_rate": 6.209211207409225e-05,
+      "loss": 0.0255,
+      "step": 1980
+    },
+    {
+      "epoch": 1.2773119358074223,
+      "grad_norm": 0.053071849048137665,
+      "learning_rate": 6.113219302264174e-05,
+      "loss": 0.0217,
+      "step": 1990
+    },
+    {
+      "epoch": 1.2837311935807423,
+      "grad_norm": 0.05462060496211052,
+      "learning_rate": 6.017647443213974e-05,
+      "loss": 0.0246,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2837311935807423,
+      "eval_loss": 0.046459589153528214,
+      "eval_runtime": 128.8944,
+      "eval_samples_per_second": 10.202,
+      "eval_steps_per_second": 10.202,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2901504513540623,
+      "grad_norm": 0.052726052701473236,
+      "learning_rate": 5.9225059587513454e-05,
+      "loss": 0.0227,
+      "step": 2010
+    },
+    {
+      "epoch": 1.296569709127382,
+      "grad_norm": 0.05038246139883995,
+      "learning_rate": 5.8278051308582505e-05,
+      "loss": 0.0273,
+      "step": 2020
+    },
+    {
+      "epoch": 1.3029889669007022,
+      "grad_norm": 0.09061837941408157,
+      "learning_rate": 5.733555193894695e-05,
+      "loss": 0.0296,
+      "step": 2030
+    },
+    {
+      "epoch": 1.309408224674022,
+      "grad_norm": 0.11903833597898483,
+      "learning_rate": 5.6397663334927096e-05,
+      "loss": 0.0309,
+      "step": 2040
+    },
+    {
+      "epoch": 1.3158274824473422,
+      "grad_norm": 0.05378040671348572,
+      "learning_rate": 5.5464486854555744e-05,
+      "loss": 0.0236,
+      "step": 2050
+    },
+    {
+      "epoch": 1.322246740220662,
+      "grad_norm": 0.049429334700107574,
+      "learning_rate": 5.453612334662446e-05,
+      "loss": 0.0248,
+      "step": 2060
+    },
+    {
+      "epoch": 1.3286659979939819,
+      "grad_norm": 0.0931214764714241,
+      "learning_rate": 5.361267313978472e-05,
+      "loss": 0.0319,
+      "step": 2070
+    },
+    {
+      "epoch": 1.3350852557673019,
+      "grad_norm": 0.0589577853679657,
+      "learning_rate": 5.2694236031705446e-05,
+      "loss": 0.0253,
+      "step": 2080
+    },
+    {
+      "epoch": 1.3415045135406218,
+      "grad_norm": 0.10711564123630524,
+      "learning_rate": 5.178091127828777e-05,
+      "loss": 0.0283,
+      "step": 2090
+    },
+    {
+      "epoch": 1.3479237713139418,
+      "grad_norm": 0.0872858390212059,
+      "learning_rate": 5.087279758293837e-05,
+      "loss": 0.0237,
+      "step": 2100
+    },
+    {
+      "epoch": 1.3543430290872618,
+      "grad_norm": 0.05081092566251755,
+      "learning_rate": 4.996999308590266e-05,
+      "loss": 0.0246,
+      "step": 2110
+    },
+    {
+      "epoch": 1.3607622868605818,
+      "grad_norm": 0.06178516149520874,
+      "learning_rate": 4.907259535365859e-05,
+      "loss": 0.0238,
+      "step": 2120
+    },
+    {
+      "epoch": 1.3671815446339017,
+      "grad_norm": 0.054508257657289505,
+      "learning_rate": 4.818070136837275e-05,
+      "loss": 0.0209,
+      "step": 2130
+    },
+    {
+      "epoch": 1.3736008024072217,
+      "grad_norm": 0.1331976354122162,
+      "learning_rate": 4.72944075174193e-05,
+      "loss": 0.022,
+      "step": 2140
+    },
+    {
+      "epoch": 1.3800200601805417,
+      "grad_norm": 0.06243107095360756,
+      "learning_rate": 4.6413809582963484e-05,
+      "loss": 0.0172,
+      "step": 2150
+    },
+    {
+      "epoch": 1.3864393179538617,
+      "grad_norm": 0.08247397094964981,
+      "learning_rate": 4.553900273161036e-05,
+      "loss": 0.0213,
+      "step": 2160
+    },
+    {
+      "epoch": 1.3928585757271814,
+      "grad_norm": 0.06733040511608124,
+      "learning_rate": 4.467008150412e-05,
+      "loss": 0.0247,
+      "step": 2170
+    },
+    {
+      "epoch": 1.3992778335005016,
+      "grad_norm": 0.05423252657055855,
+      "learning_rate": 4.3807139805190613e-05,
+      "loss": 0.0247,
+      "step": 2180
+    },
+    {
+      "epoch": 1.4056970912738214,
+      "grad_norm": 0.10230846703052521,
+      "learning_rate": 4.295027089331013e-05,
+      "loss": 0.0212,
+      "step": 2190
+    },
+    {
+      "epoch": 1.4121163490471416,
+      "grad_norm": 0.05248994752764702,
+      "learning_rate": 4.2099567370677687e-05,
+      "loss": 0.0177,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4185356068204613,
+      "grad_norm": 0.07035645842552185,
+      "learning_rate": 4.125512117319612e-05,
+      "loss": 0.021,
+      "step": 2210
+    },
+    {
+      "epoch": 1.4249548645937813,
+      "grad_norm": 0.10099633783102036,
+      "learning_rate": 4.041702356053639e-05,
+      "loss": 0.0212,
+      "step": 2220
+    },
+    {
+      "epoch": 1.4313741223671013,
+      "grad_norm": 0.13029153645038605,
+      "learning_rate": 3.958536510627511e-05,
+      "loss": 0.0191,
+      "step": 2230
+    },
+    {
+      "epoch": 1.4377933801404212,
+      "grad_norm": 0.06044310703873634,
+      "learning_rate": 3.876023568810622e-05,
+      "loss": 0.0205,
+      "step": 2240
+    },
+    {
+      "epoch": 1.4442126379137412,
+      "grad_norm": 0.07602677494287491,
+      "learning_rate": 3.794172447812785e-05,
+      "loss": 0.0184,
+      "step": 2250
+    },
+    {
+      "epoch": 1.4506318956870612,
+      "grad_norm": 0.08735393732786179,
+      "learning_rate": 3.7129919933205536e-05,
+      "loss": 0.0251,
+      "step": 2260
+    },
+    {
+      "epoch": 1.4570511534603812,
+      "grad_norm": 0.04306063801050186,
+      "learning_rate": 3.6324909785412445e-05,
+      "loss": 0.0183,
+      "step": 2270
+    },
+    {
+      "epoch": 1.4634704112337011,
+      "grad_norm": 0.05602416768670082,
+      "learning_rate": 3.552678103254838e-05,
+      "loss": 0.0229,
+      "step": 2280
+    },
+    {
+      "epoch": 1.4698896690070211,
+      "grad_norm": 0.09133511781692505,
+      "learning_rate": 3.4735619928737764e-05,
+      "loss": 0.0211,
+      "step": 2290
+    },
+    {
+      "epoch": 1.476308926780341,
+      "grad_norm": 0.10781540721654892,
+      "learning_rate": 3.395151197510804e-05,
+      "loss": 0.0198,
+      "step": 2300
+    },
+    {
+      "epoch": 1.482728184553661,
+      "grad_norm": 0.06286901980638504,
+      "learning_rate": 3.3174541910549784e-05,
+      "loss": 0.0221,
+      "step": 2310
+    },
+    {
+      "epoch": 1.4891474423269808,
+      "grad_norm": 0.046533193439245224,
+      "learning_rate": 3.2404793702558636e-05,
+      "loss": 0.0188,
+      "step": 2320
+    },
+    {
+      "epoch": 1.495566700100301,
+      "grad_norm": 0.034077636897563934,
+      "learning_rate": 3.1642350538161045e-05,
+      "loss": 0.017,
+      "step": 2330
+    },
+    {
+      "epoch": 1.5019859578736208,
+      "grad_norm": 0.053730156272649765,
+      "learning_rate": 3.088729481492424e-05,
+      "loss": 0.0204,
+      "step": 2340
+    },
+    {
+      "epoch": 1.508405215646941,
+      "grad_norm": 0.060676686465740204,
+      "learning_rate": 3.0139708132051424e-05,
+      "loss": 0.0201,
+      "step": 2350
+    },
+    {
+      "epoch": 1.5148244734202607,
+      "grad_norm": 0.09910279512405396,
+      "learning_rate": 2.939967128156328e-05,
+      "loss": 0.0173,
+      "step": 2360
+    },
+    {
+      "epoch": 1.5212437311935807,
+      "grad_norm": 0.0654776319861412,
+      "learning_rate": 2.866726423956687e-05,
+      "loss": 0.0233,
+      "step": 2370
+    },
+    {
+      "epoch": 1.5276629889669007,
+      "grad_norm": 0.08738186955451965,
+      "learning_rate": 2.794256615761247e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 1.5340822467402206,
+      "grad_norm": 0.12039466947317123,
+      "learning_rate": 2.7225655354139677e-05,
+      "loss": 0.0193,
+      "step": 2390
+    },
+    {
+      "epoch": 1.5405015045135406,
+      "grad_norm": 0.12398708611726761,
+      "learning_rate": 2.6516609306013462e-05,
+      "loss": 0.024,
+      "step": 2400
+    },
+    {
+      "epoch": 1.5469207622868606,
+      "grad_norm": 0.09554693102836609,
+      "learning_rate": 2.5815504640151267e-05,
+      "loss": 0.0195,
+      "step": 2410
+    },
+    {
+      "epoch": 1.5533400200601806,
+      "grad_norm": 0.26604852080345154,
+      "learning_rate": 2.512241712524185e-05,
+      "loss": 0.0277,
+      "step": 2420
+    },
+    {
+      "epoch": 1.5597592778335005,
+      "grad_norm": 0.08400601893663406,
+      "learning_rate": 2.443742166355695e-05,
+      "loss": 0.0267,
+      "step": 2430
+    },
+    {
+      "epoch": 1.5661785356068205,
+      "grad_norm": 0.062076788395643234,
+      "learning_rate": 2.3760592282856565e-05,
+      "loss": 0.0209,
+      "step": 2440
+    },
+    {
+      "epoch": 1.5725977933801403,
+      "grad_norm": 0.10021471977233887,
+      "learning_rate": 2.309200212838878e-05,
+      "loss": 0.0225,
+      "step": 2450
+    },
+    {
+      "epoch": 1.5790170511534605,
+      "grad_norm": 0.06898768991231918,
+      "learning_rate": 2.2431723454984778e-05,
+      "loss": 0.0175,
+      "step": 2460
+    },
+    {
+      "epoch": 1.5854363089267802,
+      "grad_norm": 0.06565247476100922,
+      "learning_rate": 2.1779827619250458e-05,
+      "loss": 0.019,
+      "step": 2470
+    },
+    {
+      "epoch": 1.5918555667001004,
+      "grad_norm": 0.10120698809623718,
+      "learning_rate": 2.1136385071854715e-05,
+      "loss": 0.0214,
+      "step": 2480
+    },
+    {
+      "epoch": 1.5982748244734202,
+      "grad_norm": 0.03913561999797821,
+      "learning_rate": 2.050146534991587e-05,
+      "loss": 0.0214,
+      "step": 2490
+    },
+    {
+      "epoch": 1.6046940822467404,
+      "grad_norm": 0.07003747671842575,
+      "learning_rate": 1.987513706948678e-05,
+      "loss": 0.0175,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6046940822467404,
+      "eval_loss": 0.04088287055492401,
+      "eval_runtime": 129.5987,
+      "eval_samples_per_second": 10.147,
+      "eval_steps_per_second": 10.147,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6111133400200601,
+      "grad_norm": 0.06662087887525558,
+      "learning_rate": 1.9257467918139428e-05,
+      "loss": 0.0212,
+      "step": 2510
+    },
+    {
+      "epoch": 1.61753259779338,
+      "grad_norm": 0.10407735407352448,
+      "learning_rate": 1.8648524647649925e-05,
+      "loss": 0.0227,
+      "step": 2520
+    },
+    {
+      "epoch": 1.6239518555667,
+      "grad_norm": 0.048161495476961136,
+      "learning_rate": 1.8048373066784575e-05,
+      "loss": 0.0193,
+      "step": 2530
+    },
+    {
+      "epoch": 1.63037111334002,
+      "grad_norm": 0.04323771968483925,
+      "learning_rate": 1.7457078034188068e-05,
+      "loss": 0.0184,
+      "step": 2540
+    },
+    {
+      "epoch": 1.63679037111334,
+      "grad_norm": 0.030361974611878395,
+      "learning_rate": 1.687470345137383e-05,
+      "loss": 0.0161,
+      "step": 2550
+    },
+    {
+      "epoch": 1.64320962888666,
+      "grad_norm": 0.064698725938797,
+      "learning_rate": 1.63013122558185e-05,
+      "loss": 0.0218,
+      "step": 2560
+    },
+    {
+      "epoch": 1.64962888665998,
+      "grad_norm": 0.04122168570756912,
+      "learning_rate": 1.5736966414160103e-05,
+      "loss": 0.0205,
+      "step": 2570
+    },
+    {
+      "epoch": 1.6560481444333,
+      "grad_norm": 0.0699082463979721,
+      "learning_rate": 1.5181726915501272e-05,
+      "loss": 0.0228,
+      "step": 2580
+    },
+    {
+      "epoch": 1.66246740220662,
+      "grad_norm": 0.08534803241491318,
+      "learning_rate": 1.4635653764818169e-05,
+      "loss": 0.0203,
+      "step": 2590
+    },
+    {
+      "epoch": 1.6688866599799397,
+      "grad_norm": 0.04532073438167572,
+      "learning_rate": 1.4098805976475704e-05,
+      "loss": 0.0219,
+      "step": 2600
+    },
+    {
+      "epoch": 1.6753059177532599,
+      "grad_norm": 0.06504928320646286,
+      "learning_rate": 1.3571241567849856e-05,
+      "loss": 0.0166,
+      "step": 2610
+    },
+    {
+      "epoch": 1.6817251755265796,
+      "grad_norm": 0.1507658064365387,
+      "learning_rate": 1.3053017553057656e-05,
+      "loss": 0.0217,
+      "step": 2620
+    },
+    {
+      "epoch": 1.6881444332998998,
+      "grad_norm": 0.05026421695947647,
+      "learning_rate": 1.2544189936795715e-05,
+      "loss": 0.0174,
+      "step": 2630
+    },
+    {
+      "epoch": 1.6945636910732196,
+      "grad_norm": 0.07838805764913559,
+      "learning_rate": 1.204481370828765e-05,
+      "loss": 0.0188,
+      "step": 2640
+    },
+    {
+      "epoch": 1.7009829488465398,
+      "grad_norm": 0.03739321231842041,
+      "learning_rate": 1.1554942835341565e-05,
+      "loss": 0.0162,
+      "step": 2650
+    },
+    {
+      "epoch": 1.7074022066198595,
+      "grad_norm": 0.07961956411600113,
+      "learning_rate": 1.1074630258517538e-05,
+      "loss": 0.0186,
+      "step": 2660
+    },
+    {
+      "epoch": 1.7138214643931795,
+      "grad_norm": 0.047265585511922836,
+      "learning_rate": 1.0603927885406451e-05,
+      "loss": 0.0211,
+      "step": 2670
+    },
+    {
+      "epoch": 1.7202407221664995,
+      "grad_norm": 0.08342643082141876,
+      "learning_rate": 1.0142886585020218e-05,
+      "loss": 0.0197,
+      "step": 2680
+    },
+    {
+      "epoch": 1.7266599799398195,
+      "grad_norm": 0.0401916429400444,
+      "learning_rate": 9.691556182294392e-06,
+      "loss": 0.0184,
+      "step": 2690
+    },
+    {
+      "epoch": 1.7330792377131394,
+      "grad_norm": 0.10155277699232101,
+      "learning_rate": 9.249985452703557e-06,
+      "loss": 0.0199,
+      "step": 2700
+    },
+    {
+      "epoch": 1.7394984954864594,
+      "grad_norm": 0.048305802047252655,
+      "learning_rate": 8.81822211699016e-06,
+      "loss": 0.0184,
+      "step": 2710
+    },
+    {
+      "epoch": 1.7459177532597794,
+      "grad_norm": 0.055666640400886536,
+      "learning_rate": 8.396312836007259e-06,
+      "loss": 0.018,
+      "step": 2720
+    },
+    {
+      "epoch": 1.7523370110330991,
+      "grad_norm": 0.08541178703308105,
+      "learning_rate": 7.984303205675924e-06,
+      "loss": 0.0233,
+      "step": 2730
+    },
+    {
+      "epoch": 1.7587562688064193,
+      "grad_norm": 0.07454168796539307,
+      "learning_rate": 7.582237752057608e-06,
+      "loss": 0.0207,
+      "step": 2740
+    },
+    {
+      "epoch": 1.765175526579739,
+      "grad_norm": 0.04274304583668709,
+      "learning_rate": 7.19015992654225e-06,
+      "loss": 0.0207,
+      "step": 2750
+    },
+    {
+      "epoch": 1.7715947843530593,
+      "grad_norm": 0.03931812569499016,
+      "learning_rate": 6.808112101152419e-06,
+      "loss": 0.0187,
+      "step": 2760
+    },
+    {
+      "epoch": 1.778014042126379,
+      "grad_norm": 0.04860474541783333,
+      "learning_rate": 6.436135563964196e-06,
+      "loss": 0.0208,
+      "step": 2770
+    },
+    {
+      "epoch": 1.7844332998996992,
+      "grad_norm": 0.05222581326961517,
+      "learning_rate": 6.074270514645109e-06,
+      "loss": 0.0241,
+      "step": 2780
+    },
+    {
+      "epoch": 1.790852557673019,
+      "grad_norm": 0.07665343582630157,
+      "learning_rate": 5.722556060109751e-06,
+      "loss": 0.0232,
+      "step": 2790
+    },
+    {
+      "epoch": 1.7972718154463392,
+      "grad_norm": 0.05370553582906723,
+      "learning_rate": 5.381030210293503e-06,
+      "loss": 0.0158,
+      "step": 2800
+    },
+    {
+      "epoch": 1.803691073219659,
+      "grad_norm": 0.036142729222774506,
+      "learning_rate": 5.049729874044762e-06,
+      "loss": 0.0172,
+      "step": 2810
+    },
+    {
+      "epoch": 1.810110330992979,
+      "grad_norm": 0.03157337009906769,
+      "learning_rate": 4.7286908551361755e-06,
+      "loss": 0.0181,
+      "step": 2820
+    },
+    {
+      "epoch": 1.8165295887662989,
+      "grad_norm": 0.04949560761451721,
+      "learning_rate": 4.417947848395332e-06,
+      "loss": 0.0189,
+      "step": 2830
+    },
+    {
+      "epoch": 1.8229488465396189,
+      "grad_norm": 0.07635015994310379,
+      "learning_rate": 4.117534435955261e-06,
+      "loss": 0.0214,
+      "step": 2840
+    },
+    {
+      "epoch": 1.8293681043129388,
+      "grad_norm": 0.028284436091780663,
+      "learning_rate": 3.827483083625238e-06,
+      "loss": 0.0202,
+      "step": 2850
+    },
+    {
+      "epoch": 1.8357873620862588,
+      "grad_norm": 0.07148082554340363,
+      "learning_rate": 3.5478251373821103e-06,
+      "loss": 0.0189,
+      "step": 2860
+    },
+    {
+      "epoch": 1.8422066198595788,
+      "grad_norm": 0.03399994224309921,
+      "learning_rate": 3.2785908199828073e-06,
+      "loss": 0.0148,
+      "step": 2870
+    },
+    {
+      "epoch": 1.8486258776328985,
+      "grad_norm": 0.05979405716061592,
+      "learning_rate": 3.0198092276981004e-06,
+      "loss": 0.0191,
+      "step": 2880
+    },
+    {
+      "epoch": 1.8550451354062187,
+      "grad_norm": 0.03924340382218361,
+      "learning_rate": 2.7715083271681706e-06,
+      "loss": 0.02,
+      "step": 2890
+    },
+    {
+      "epoch": 1.8614643931795385,
+      "grad_norm": 0.05604667589068413,
+      "learning_rate": 2.5337149523802615e-06,
+      "loss": 0.0198,
+      "step": 2900
+    },
+    {
+      "epoch": 1.8678836509528587,
+      "grad_norm": 0.050481703132390976,
+      "learning_rate": 2.306454801768676e-06,
+      "loss": 0.0183,
+      "step": 2910
+    },
+    {
+      "epoch": 1.8743029087261784,
+      "grad_norm": 0.05771298334002495,
+      "learning_rate": 2.0897524354375753e-06,
+      "loss": 0.0212,
+      "step": 2920
+    },
+    {
+      "epoch": 1.8807221664994986,
+      "grad_norm": 0.05019890144467354,
+      "learning_rate": 1.8836312725067474e-06,
+      "loss": 0.0162,
+      "step": 2930
+    },
+    {
+      "epoch": 1.8871414242728184,
+      "grad_norm": 0.08003148436546326,
+      "learning_rate": 1.6881135885806753e-06,
+      "loss": 0.0251,
+      "step": 2940
+    },
+    {
+      "epoch": 1.8935606820461384,
+      "grad_norm": 0.04569955915212631,
+      "learning_rate": 1.5032205133412192e-06,
+      "loss": 0.0184,
+      "step": 2950
+    },
+    {
+      "epoch": 1.8999799398194583,
+      "grad_norm": 0.02558874897658825,
+      "learning_rate": 1.3289720282641306e-06,
+      "loss": 0.0194,
+      "step": 2960
+    },
+    {
+      "epoch": 1.9063991975927783,
+      "grad_norm": 0.03270651400089264,
+      "learning_rate": 1.1653869644596027e-06,
+      "loss": 0.0178,
+      "step": 2970
+    },
+    {
+      "epoch": 1.9128184553660983,
+      "grad_norm": 0.041952360421419144,
+      "learning_rate": 1.0124830006372432e-06,
+      "loss": 0.0226,
+      "step": 2980
+    },
+    {
+      "epoch": 1.9192377131394183,
+      "grad_norm": 0.02816896326839924,
+      "learning_rate": 8.702766611954793e-07,
+      "loss": 0.0154,
+      "step": 2990
+    },
+    {
+      "epoch": 1.9256569709127382,
+      "grad_norm": 0.04016564413905144,
+      "learning_rate": 7.387833144358092e-07,
+      "loss": 0.0181,
+      "step": 3000
+    },
+    {
+      "epoch": 1.9256569709127382,
+      "eval_loss": 0.03883149474859238,
+      "eval_runtime": 128.8321,
+      "eval_samples_per_second": 10.207,
+      "eval_steps_per_second": 10.207,
+      "step": 3000
+    },
+    {
+      "epoch": 1.9320762286860582,
+      "grad_norm": 0.04140494018793106,
+      "learning_rate": 6.180171709018967e-07,
+      "loss": 0.0185,
+      "step": 3010
+    },
+    {
+      "epoch": 1.9384954864593782,
+      "grad_norm": 0.05678441375494003,
+      "learning_rate": 5.079912818438115e-07,
+      "loss": 0.0166,
+      "step": 3020
+    },
+    {
+      "epoch": 1.944914744232698,
+      "grad_norm": 0.057847701013088226,
+      "learning_rate": 4.087175378076791e-07,
+      "loss": 0.0165,
+      "step": 3030
+    },
+    {
+      "epoch": 1.9513340020060181,
+      "grad_norm": 0.09393943846225739,
+      "learning_rate": 3.2020666735053105e-07,
+      "loss": 0.0214,
+      "step": 3040
+    },
+    {
+      "epoch": 1.957753259779338,
+      "grad_norm": 0.05397270992398262,
+      "learning_rate": 2.424682358809882e-07,
+      "loss": 0.0166,
+      "step": 3050
+    },
+    {
+      "epoch": 1.964172517552658,
+      "grad_norm": 0.09389074146747589,
+      "learning_rate": 1.7551064462543176e-07,
+      "loss": 0.0172,
+      "step": 3060
+    },
+    {
+      "epoch": 1.9705917753259778,
+      "grad_norm": 0.043656717985868454,
+      "learning_rate": 1.1934112972009638e-07,
+      "loss": 0.0178,
+      "step": 3070
+    },
+    {
+      "epoch": 1.977011033099298,
+      "grad_norm": 0.09749849885702133,
+      "learning_rate": 7.396576142912892e-08,
+      "loss": 0.0169,
+      "step": 3080
+    },
+    {
+      "epoch": 1.9834302908726178,
+      "grad_norm": 0.06255625933408737,
+      "learning_rate": 3.9389443488457856e-08,
+      "loss": 0.0209,
+      "step": 3090
+    },
+    {
+      "epoch": 1.9898495486459378,
+      "grad_norm": 0.052814945578575134,
+      "learning_rate": 1.5615912575928092e-08,
+      "loss": 0.0191,
+      "step": 3100
+    },
+    {
+      "epoch": 1.9962688064192577,
+      "grad_norm": 0.08855212479829788,
+      "learning_rate": 2.6477379074796305e-09,
+      "loss": 0.0184,
+      "step": 3110
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3116,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.712259894975867e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3116/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cac3f796d8e8ed78082f710e5c9ee0db63889906ddeefcc55981584c04b6c12
+size 5624

checkpoint-3116/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

final-adapter/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Coder-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Coder-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

final-adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Coder-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

final-adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cb453fefadc9b96c6f5f72e9b3c53ec46748149127236a2cdf80a24e561d676
+size 645975704

final-adapter/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}