Rohan1103 committed on Apr 25

Commit

e1c6894

verified ·

1 Parent(s): 1386560

Upload 30 files

Browse files

Files changed (31) hide show

.gitattributes +3 -0
sft_model/README.md +63 -0
sft_model/adapter_config.json +50 -0
sft_model/adapter_model.safetensors +3 -0
sft_model/chat_template.jinja +54 -0
sft_model/checkpoint-3000/README.md +210 -0
sft_model/checkpoint-3000/adapter_config.json +50 -0
sft_model/checkpoint-3000/adapter_model.safetensors +3 -0
sft_model/checkpoint-3000/chat_template.jinja +54 -0
sft_model/checkpoint-3000/optimizer.pt +3 -0
sft_model/checkpoint-3000/rng_state.pth +3 -0
sft_model/checkpoint-3000/scaler.pt +3 -0
sft_model/checkpoint-3000/scheduler.pt +3 -0
sft_model/checkpoint-3000/tokenizer.json +3 -0
sft_model/checkpoint-3000/tokenizer_config.json +16 -0
sft_model/checkpoint-3000/trainer_state.json +1084 -0
sft_model/checkpoint-3000/training_args.bin +3 -0
sft_model/checkpoint-3136/README.md +210 -0
sft_model/checkpoint-3136/adapter_config.json +50 -0
sft_model/checkpoint-3136/adapter_model.safetensors +3 -0
sft_model/checkpoint-3136/chat_template.jinja +54 -0
sft_model/checkpoint-3136/optimizer.pt +3 -0
sft_model/checkpoint-3136/rng_state.pth +3 -0
sft_model/checkpoint-3136/scaler.pt +3 -0
sft_model/checkpoint-3136/scheduler.pt +3 -0
sft_model/checkpoint-3136/tokenizer.json +3 -0
sft_model/checkpoint-3136/tokenizer_config.json +16 -0
sft_model/checkpoint-3136/trainer_state.json +1126 -0
sft_model/checkpoint-3136/training_args.bin +3 -0
sft_model/tokenizer.json +3 -0
sft_model/tokenizer_config.json +16 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sft_model/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+sft_model/checkpoint-3136/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+sft_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text

sft_model/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+library_name: peft
+model_name: sft_model
+tags:
+- base_model:adapter:/kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+licence: license
+pipeline_tag: text-generation
+base_model: /kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1
+---
+# Model Card for sft_model
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.18.1
+- TRL: 0.24.0
+- Transformers: 5.5.0
+- Pytorch: 2.10.0+cu128
+- Datasets: 4.3.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

sft_model/adapter_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Qwen2ForCausalLM",
+    "parent_library": "transformers.models.qwen2.modeling_qwen2",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "/kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_model/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36a55b5b1902fef5f8c5417adb58fcf4e696995d7a06a9d85d073e1ddb311d3c
+size 161533192

sft_model/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

sft_model/checkpoint-3000/README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+base_model: /kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

sft_model/checkpoint-3000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Qwen2ForCausalLM",
+    "parent_library": "transformers.models.qwen2.modeling_qwen2",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "/kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_model/checkpoint-3000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1570db757213f113830d6aa39e45b2808150655e6088cdfbbe70861402dbad4
+size 161533192

sft_model/checkpoint-3000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

sft_model/checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec58fcd1bd0696414d0c4e9870f6fdf455982c92a70aef3ad4040b1634307aa7
+size 82465413

sft_model/checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13c823fb2997dc4a132136df7944a54384a7523853cebf9d28a64aa89e6cb5cd
+size 14645

sft_model/checkpoint-3000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ac1c46a2776d12775d23d0f587efc112188137ce2140da35bc15d301c9f620e
+size 1383

sft_model/checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1f4c08c2146edf92228bb2698fd772a1ca948866ec1546b958b46c5606a2cad
+size 1465

sft_model/checkpoint-3000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b4360dd6a184650ffc48056c2569bc603f896c5adfe94b10f1c79f809638aa5
+size 11422166

sft_model/checkpoint-3000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [],
+  "is_local": true,
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

sft_model/checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1084 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9568614942987003,
+  "eval_steps": 500,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0063790766286580015,
+      "grad_norm": 1.1234201192855835,
+      "learning_rate": 2.4203821656050956e-05,
+      "loss": 2.6540979385375976,
+      "step": 20
+    },
+    {
+      "epoch": 0.012758153257316003,
+      "grad_norm": 0.7255190014839172,
+      "learning_rate": 4.968152866242039e-05,
+      "loss": 1.852321243286133,
+      "step": 40
+    },
+    {
+      "epoch": 0.019137229885974005,
+      "grad_norm": 0.4847199022769928,
+      "learning_rate": 7.515923566878981e-05,
+      "loss": 1.3090819358825683,
+      "step": 60
+    },
+    {
+      "epoch": 0.025516306514632006,
+      "grad_norm": 0.44303005933761597,
+      "learning_rate": 0.00010063694267515924,
+      "loss": 1.2170228004455566,
+      "step": 80
+    },
+    {
+      "epoch": 0.03189538314329001,
+      "grad_norm": 0.40238410234451294,
+      "learning_rate": 0.00012611464968152866,
+      "loss": 1.169914722442627,
+      "step": 100
+    },
+    {
+      "epoch": 0.03827445977194801,
+      "grad_norm": 0.4358486831188202,
+      "learning_rate": 0.0001515923566878981,
+      "loss": 1.1707877159118651,
+      "step": 120
+    },
+    {
+      "epoch": 0.044653536400606014,
+      "grad_norm": 0.5451267957687378,
+      "learning_rate": 0.00017707006369426754,
+      "loss": 1.1448563575744628,
+      "step": 140
+    },
+    {
+      "epoch": 0.05103261302926401,
+      "grad_norm": 0.44014137983322144,
+      "learning_rate": 0.00019999977757245233,
+      "loss": 1.1011881828308105,
+      "step": 160
+    },
+    {
+      "epoch": 0.05741168965792202,
+      "grad_norm": 0.4952339828014374,
+      "learning_rate": 0.00019997308746398385,
+      "loss": 1.1278093338012696,
+      "step": 180
+    },
+    {
+      "epoch": 0.06379076628658002,
+      "grad_norm": 0.48793825507164,
+      "learning_rate": 0.00019990192545034244,
+      "loss": 1.121494674682617,
+      "step": 200
+    },
+    {
+      "epoch": 0.07016984291523802,
+      "grad_norm": 0.36123648285865784,
+      "learning_rate": 0.00019978632318715074,
+      "loss": 1.1106701850891114,
+      "step": 220
+    },
+    {
+      "epoch": 0.07654891954389602,
+      "grad_norm": 0.3949609398841858,
+      "learning_rate": 0.0001996263320987772,
+      "loss": 1.112882137298584,
+      "step": 240
+    },
+    {
+      "epoch": 0.08292799617255402,
+      "grad_norm": 0.3579845130443573,
+      "learning_rate": 0.00019942202335546062,
+      "loss": 1.0800884246826172,
+      "step": 260
+    },
+    {
+      "epoch": 0.08930707280121203,
+      "grad_norm": 0.337467759847641,
+      "learning_rate": 0.00019917348784165092,
+      "loss": 1.1052613258361816,
+      "step": 280
+    },
+    {
+      "epoch": 0.09568614942987003,
+      "grad_norm": 0.35249248147010803,
+      "learning_rate": 0.00019888083611558014,
+      "loss": 1.0701614379882813,
+      "step": 300
+    },
+    {
+      "epoch": 0.10206522605852802,
+      "grad_norm": 0.3642044961452484,
+      "learning_rate": 0.0001985441983600819,
+      "loss": 1.061861228942871,
+      "step": 320
+    },
+    {
+      "epoch": 0.10844430268718604,
+      "grad_norm": 0.35655465722084045,
+      "learning_rate": 0.00019816372432468097,
+      "loss": 1.1088621139526367,
+      "step": 340
+    },
+    {
+      "epoch": 0.11482337931584403,
+      "grad_norm": 0.3401583135128021,
+      "learning_rate": 0.00019773958325897895,
+      "loss": 1.0811405181884766,
+      "step": 360
+    },
+    {
+      "epoch": 0.12120245594450203,
+      "grad_norm": 0.39485183358192444,
+      "learning_rate": 0.00019727196383736547,
+      "loss": 1.0966490745544433,
+      "step": 380
+    },
+    {
+      "epoch": 0.12758153257316004,
+      "grad_norm": 0.3158703148365021,
+      "learning_rate": 0.00019676107407508843,
+      "loss": 1.0605661392211914,
+      "step": 400
+    },
+    {
+      "epoch": 0.13396060920181804,
+      "grad_norm": 0.4091622531414032,
+      "learning_rate": 0.00019620714123572085,
+      "loss": 1.0958724975585938,
+      "step": 420
+    },
+    {
+      "epoch": 0.14033968583047604,
+      "grad_norm": 0.44412222504615784,
+      "learning_rate": 0.00019561041173006517,
+      "loss": 1.0952692031860352,
+      "step": 440
+    },
+    {
+      "epoch": 0.14671876245913404,
+      "grad_norm": 0.37837275862693787,
+      "learning_rate": 0.00019497115100654015,
+      "loss": 1.058570671081543,
+      "step": 460
+    },
+    {
+      "epoch": 0.15309783908779204,
+      "grad_norm": 0.3814297914505005,
+      "learning_rate": 0.00019428964343309922,
+      "loss": 1.0655178070068358,
+      "step": 480
+    },
+    {
+      "epoch": 0.15947691571645004,
+      "grad_norm": 0.4030211269855499,
+      "learning_rate": 0.00019356619217073253,
+      "loss": 1.0634085655212402,
+      "step": 500
+    },
+    {
+      "epoch": 0.16585599234510803,
+      "grad_norm": 0.38679832220077515,
+      "learning_rate": 0.00019280111903860912,
+      "loss": 1.0652976989746095,
+      "step": 520
+    },
+    {
+      "epoch": 0.17223506897376606,
+      "grad_norm": 0.3280096650123596,
+      "learning_rate": 0.00019199476437091933,
+      "loss": 1.0508193016052245,
+      "step": 540
+    },
+    {
+      "epoch": 0.17861414560242406,
+      "grad_norm": 0.3656080961227417,
+      "learning_rate": 0.0001911474868654811,
+      "loss": 1.0890400886535645,
+      "step": 560
+    },
+    {
+      "epoch": 0.18499322223108206,
+      "grad_norm": 0.3993516266345978,
+      "learning_rate": 0.00019025966342417697,
+      "loss": 1.0906559944152832,
+      "step": 580
+    },
+    {
+      "epoch": 0.19137229885974005,
+      "grad_norm": 0.36238643527030945,
+      "learning_rate": 0.00018933168898529383,
+      "loss": 1.0661024093627929,
+      "step": 600
+    },
+    {
+      "epoch": 0.19775137548839805,
+      "grad_norm": 0.40609851479530334,
+      "learning_rate": 0.00018836397634783883,
+      "loss": 1.0469740867614745,
+      "step": 620
+    },
+    {
+      "epoch": 0.20413045211705605,
+      "grad_norm": 0.44951748847961426,
+      "learning_rate": 0.00018735695598791046,
+      "loss": 1.0545502662658692,
+      "step": 640
+    },
+    {
+      "epoch": 0.21050952874571405,
+      "grad_norm": 0.3537753224372864,
+      "learning_rate": 0.00018631107586720614,
+      "loss": 1.0523859977722168,
+      "step": 660
+    },
+    {
+      "epoch": 0.21688860537437207,
+      "grad_norm": 0.37729203701019287,
+      "learning_rate": 0.0001852268012337514,
+      "loss": 1.0995121002197266,
+      "step": 680
+    },
+    {
+      "epoch": 0.22326768200303007,
+      "grad_norm": 0.35974422097206116,
+      "learning_rate": 0.00018410461441493956,
+      "loss": 1.0532832145690918,
+      "step": 700
+    },
+    {
+      "epoch": 0.22964675863168807,
+      "grad_norm": 0.5495746731758118,
+      "learning_rate": 0.00018294501460297386,
+      "loss": 1.071121883392334,
+      "step": 720
+    },
+    {
+      "epoch": 0.23602583526034607,
+      "grad_norm": 0.3257412016391754,
+      "learning_rate": 0.00018174851763280733,
+      "loss": 1.072437858581543,
+      "step": 740
+    },
+    {
+      "epoch": 0.24240491188900407,
+      "grad_norm": 0.4262811839580536,
+      "learning_rate": 0.00018051565575267939,
+      "loss": 1.052683162689209,
+      "step": 760
+    },
+    {
+      "epoch": 0.24878398851766206,
+      "grad_norm": 0.38114288449287415,
+      "learning_rate": 0.00017924697738735135,
+      "loss": 1.0581499099731446,
+      "step": 780
+    },
+    {
+      "epoch": 0.2551630651463201,
+      "grad_norm": 0.3934289515018463,
+      "learning_rate": 0.00017794304689414562,
+      "loss": 1.0319182395935058,
+      "step": 800
+    },
+    {
+      "epoch": 0.2615421417749781,
+      "grad_norm": 0.3929242789745331,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 1.0609262466430665,
+      "step": 820
+    },
+    {
+      "epoch": 0.2679212184036361,
+      "grad_norm": 0.3409107029438019,
+      "learning_rate": 0.000175231765102933,
+      "loss": 1.0357558250427246,
+      "step": 840
+    },
+    {
+      "epoch": 0.2743002950322941,
+      "grad_norm": 0.45465630292892456,
+      "learning_rate": 0.00017382561988818086,
+      "loss": 1.019899272918701,
+      "step": 860
+    },
+    {
+      "epoch": 0.2806793716609521,
+      "grad_norm": 0.31893905997276306,
+      "learning_rate": 0.00017238663417554797,
+      "loss": 1.0349628448486328,
+      "step": 880
+    },
+    {
+      "epoch": 0.2870584482896101,
+      "grad_norm": 0.47289255261421204,
+      "learning_rate": 0.00017091544808166747,
+      "loss": 1.0914591789245605,
+      "step": 900
+    },
+    {
+      "epoch": 0.2934375249182681,
+      "grad_norm": 0.4013218283653259,
+      "learning_rate": 0.00016941271604715058,
+      "loss": 1.025728416442871,
+      "step": 920
+    },
+    {
+      "epoch": 0.2998166015469261,
+      "grad_norm": 0.34149670600891113,
+      "learning_rate": 0.0001678791065454658,
+      "loss": 1.0540288925170898,
+      "step": 940
+    },
+    {
+      "epoch": 0.3061956781755841,
+      "grad_norm": 0.43255019187927246,
+      "learning_rate": 0.00016631530178557618,
+      "loss": 1.0435150146484375,
+      "step": 960
+    },
+    {
+      "epoch": 0.31257475480424207,
+      "grad_norm": 0.3473828136920929,
+      "learning_rate": 0.00016472199740846628,
+      "loss": 1.0774805068969726,
+      "step": 980
+    },
+    {
+      "epoch": 0.31895383143290007,
+      "grad_norm": 0.3513583838939667,
+      "learning_rate": 0.00016309990217769403,
+      "loss": 1.0494253158569335,
+      "step": 1000
+    },
+    {
+      "epoch": 0.32533290806155807,
+      "grad_norm": 0.35562664270401,
+      "learning_rate": 0.00016144973766410531,
+      "loss": 1.0458724975585938,
+      "step": 1020
+    },
+    {
+      "epoch": 0.33171198469021607,
+      "grad_norm": 0.35419169068336487,
+      "learning_rate": 0.00015977223792485118,
+      "loss": 1.0554842948913574,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3380910613188741,
+      "grad_norm": 0.3961597979068756,
+      "learning_rate": 0.00015806814917685084,
+      "loss": 1.035719108581543,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3444701379475321,
+      "grad_norm": 0.37739962339401245,
+      "learning_rate": 0.00015633822946484543,
+      "loss": 1.0724897384643555,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3508492145761901,
+      "grad_norm": 0.40744179487228394,
+      "learning_rate": 0.0001545832483241904,
+      "loss": 1.0812921524047852,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3572282912048481,
+      "grad_norm": 0.3863188922405243,
+      "learning_rate": 0.00015280398643853605,
+      "loss": 1.0662656784057618,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3636073678335061,
+      "grad_norm": 0.4145233929157257,
+      "learning_rate": 0.0001510012352925496,
+      "loss": 1.0171710968017578,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3699864444621641,
+      "grad_norm": 0.4319663941860199,
+      "learning_rate": 0.0001491757968198319,
+      "loss": 1.002194309234619,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3763655210908221,
+      "grad_norm": 0.370866596698761,
+      "learning_rate": 0.00014732848304618628,
+      "loss": 1.054680633544922,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3827445977194801,
+      "grad_norm": 0.39589911699295044,
+      "learning_rate": 0.0001454601157283979,
+      "loss": 1.051061248779297,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3891236743481381,
+      "grad_norm": 0.38640621304512024,
+      "learning_rate": 0.00014357152598868476,
+      "loss": 1.0340915679931642,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3955027509767961,
+      "grad_norm": 0.37239277362823486,
+      "learning_rate": 0.00014166355394498202,
+      "loss": 1.0854945182800293,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4018818276054541,
+      "grad_norm": 0.4776981472969055,
+      "learning_rate": 0.00013973704833722509,
+      "loss": 1.0340095520019532,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4082609042341121,
+      "grad_norm": 0.3754555284976959,
+      "learning_rate": 0.00013779286614979728,
+      "loss": 1.0588939666748047,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4146399808627701,
+      "grad_norm": 0.3581312596797943,
+      "learning_rate": 0.0001358318722303098,
+      "loss": 1.0207186698913575,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4210190574914281,
+      "grad_norm": 0.33835941553115845,
+      "learning_rate": 0.000133854938904884,
+      "loss": 0.996804141998291,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4273981341200861,
+      "grad_norm": 0.37163445353507996,
+      "learning_rate": 0.00013186294559010703,
+      "loss": 1.0360607147216796,
+      "step": 1340
+    },
+    {
+      "epoch": 0.43377721074874415,
+      "grad_norm": 0.40468406677246094,
+      "learning_rate": 0.0001298567784018332,
+      "loss": 1.0401429176330566,
+      "step": 1360
+    },
+    {
+      "epoch": 0.44015628737740214,
+      "grad_norm": 0.3982478380203247,
+      "learning_rate": 0.00012783732976100504,
+      "loss": 1.0549521446228027,
+      "step": 1380
+    },
+    {
+      "epoch": 0.44653536400606014,
+      "grad_norm": 0.3914712965488434,
+      "learning_rate": 0.00012580549799667034,
+      "loss": 1.025351333618164,
+      "step": 1400
+    },
+    {
+      "epoch": 0.45291444063471814,
+      "grad_norm": 0.36290040612220764,
+      "learning_rate": 0.00012376218694637028,
+      "loss": 1.0487945556640625,
+      "step": 1420
+    },
+    {
+      "epoch": 0.45929351726337614,
+      "grad_norm": 0.4272157549858093,
+      "learning_rate": 0.00012170830555407726,
+      "loss": 1.0303566932678223,
+      "step": 1440
+    },
+    {
+      "epoch": 0.46567259389203414,
+      "grad_norm": 0.4141615331172943,
+      "learning_rate": 0.00011964476746586187,
+      "loss": 1.0204460144042968,
+      "step": 1460
+    },
+    {
+      "epoch": 0.47205167052069213,
+      "grad_norm": 0.43568912148475647,
+      "learning_rate": 0.00011757249062346725,
+      "loss": 1.0501635551452637,
+      "step": 1480
+    },
+    {
+      "epoch": 0.47843074714935013,
+      "grad_norm": 0.4742281436920166,
+      "learning_rate": 0.00011549239685597327,
+      "loss": 1.0212038993835448,
+      "step": 1500
+    },
+    {
+      "epoch": 0.48480982377800813,
+      "grad_norm": 0.3556975722312927,
+      "learning_rate": 0.00011340541146973109,
+      "loss": 1.0127756118774414,
+      "step": 1520
+    },
+    {
+      "epoch": 0.49118890040666613,
+      "grad_norm": 0.3971082270145416,
+      "learning_rate": 0.0001113124628367512,
+      "loss": 1.0366864204406738,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4975679770353241,
+      "grad_norm": 0.38870203495025635,
+      "learning_rate": 0.00010921448198172721,
+      "loss": 1.0176129341125488,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5039470536639822,
+      "grad_norm": 0.36224520206451416,
+      "learning_rate": 0.00010711240216788036,
+      "loss": 0.9911076545715332,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5103261302926402,
+      "grad_norm": 0.398709237575531,
+      "learning_rate": 0.0001050071584818077,
+      "loss": 1.0506349563598634,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5167052069212982,
+      "grad_norm": 0.44569212198257446,
+      "learning_rate": 0.00010289968741751914,
+      "loss": 1.0399697303771973,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5230842835499562,
+      "grad_norm": 0.3576385974884033,
+      "learning_rate": 0.00010079092645984893,
+      "loss": 1.0107657432556152,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5294633601786142,
+      "grad_norm": 0.480198472738266,
+      "learning_rate": 9.868181366742589e-05,
+      "loss": 0.9988100051879882,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5358424368072722,
+      "grad_norm": 0.36921289563179016,
+      "learning_rate": 9.657328725538849e-05,
+      "loss": 1.0404596328735352,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5422215134359302,
+      "grad_norm": 0.3534800708293915,
+      "learning_rate": 9.446628517803055e-05,
+      "loss": 1.0561634063720704,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5486005900645882,
+      "grad_norm": 0.5924835205078125,
+      "learning_rate": 9.236174471156265e-05,
+      "loss": 1.0100406646728515,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5549796666932462,
+      "grad_norm": 0.41996270418167114,
+      "learning_rate": 9.026060203717553e-05,
+      "loss": 1.0402870178222656,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5613587433219042,
+      "grad_norm": 0.4084126949310303,
+      "learning_rate": 8.81637918245902e-05,
+      "loss": 1.0095555305480957,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5677378199505622,
+      "grad_norm": 0.3663918077945709,
+      "learning_rate": 8.607224681628068e-05,
+      "loss": 1.0506675720214844,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5741168965792202,
+      "grad_norm": 0.4715736210346222,
+      "learning_rate": 8.398689741255405e-05,
+      "loss": 1.0435371398925781,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5804959732078782,
+      "grad_norm": 0.43865320086479187,
+      "learning_rate": 8.190867125767228e-05,
+      "loss": 1.0446551322937012,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5868750498365362,
+      "grad_norm": 0.3950525224208832,
+      "learning_rate": 7.983849282720005e-05,
+      "loss": 1.0364575386047363,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5932541264651942,
+      "grad_norm": 0.38930827379226685,
+      "learning_rate": 7.777728301676215e-05,
+      "loss": 1.0290010452270508,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5996332030938522,
+      "grad_norm": 0.399541437625885,
+      "learning_rate": 7.572595873239367e-05,
+      "loss": 1.045849609375,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6060122797225101,
+      "grad_norm": 0.38800048828125,
+      "learning_rate": 7.368543248266436e-05,
+      "loss": 1.0078801155090331,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6123913563511681,
+      "grad_norm": 0.3803689479827881,
+      "learning_rate": 7.165661197275937e-05,
+      "loss": 0.9927312850952148,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6187704329798261,
+      "grad_norm": 0.3781611919403076,
+      "learning_rate": 6.964039970069723e-05,
+      "loss": 1.0288335800170898,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6251495096084841,
+      "grad_norm": 0.3648211658000946,
+      "learning_rate": 6.76376925558633e-05,
+      "loss": 1.016789150238037,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6315285862371421,
+      "grad_norm": 0.3644472062587738,
+      "learning_rate": 6.564938142003876e-05,
+      "loss": 1.0296217918395996,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6379076628658001,
+      "grad_norm": 0.4196849763393402,
+      "learning_rate": 6.367635077110193e-05,
+      "loss": 1.0465140342712402,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6442867394944581,
+      "grad_norm": 0.4229860305786133,
+      "learning_rate": 6.171947828957813e-05,
+      "loss": 0.9935624122619628,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6506658161231161,
+      "grad_norm": 0.3640349209308624,
+      "learning_rate": 5.97796344682134e-05,
+      "loss": 1.0277256965637207,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6570448927517741,
+      "grad_norm": 0.4113347828388214,
+      "learning_rate": 5.785768222474544e-05,
+      "loss": 1.0203803062438965,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6634239693804321,
+      "grad_norm": 0.36274972558021545,
+      "learning_rate": 5.595447651804462e-05,
+      "loss": 1.0556281089782715,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6698030460090901,
+      "grad_norm": 0.387104868888855,
+      "learning_rate": 5.4070863967794885e-05,
+      "loss": 0.9899411201477051,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6761821226377482,
+      "grad_norm": 0.38654327392578125,
+      "learning_rate": 5.220768247788458e-05,
+      "loss": 1.0453373908996582,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6825611992664062,
+      "grad_norm": 0.4250175356864929,
+      "learning_rate": 5.036576086367428e-05,
+      "loss": 1.014944362640381,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6889402758950642,
+      "grad_norm": 0.4817405343055725,
+      "learning_rate": 4.854591848330782e-05,
+      "loss": 0.9892327308654785,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6953193525237222,
+      "grad_norm": 0.4661904275417328,
+      "learning_rate": 4.6748964873229526e-05,
+      "loss": 1.0213812828063964,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7016984291523802,
+      "grad_norm": 0.42874079942703247,
+      "learning_rate": 4.49756993880715e-05,
+      "loss": 1.0591063499450684,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7080775057810382,
+      "grad_norm": 0.374489963054657,
+      "learning_rate": 4.322691084506956e-05,
+      "loss": 0.9852917671203614,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7144565824096962,
+      "grad_norm": 0.3870807886123657,
+      "learning_rate": 4.150337717316658e-05,
+      "loss": 0.9926732063293457,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7208356590383542,
+      "grad_norm": 0.3918297290802002,
+      "learning_rate": 3.9805865066959725e-05,
+      "loss": 1.018412208557129,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7272147356670122,
+      "grad_norm": 0.36913925409317017,
+      "learning_rate": 3.813512964564489e-05,
+      "loss": 1.0138113975524903,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7335938122956702,
+      "grad_norm": 0.42481136322021484,
+      "learning_rate": 3.64919141171104e-05,
+      "loss": 1.0364414215087892,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7399728889243282,
+      "grad_norm": 0.35376596450805664,
+      "learning_rate": 3.48769494473294e-05,
+      "loss": 1.0163302421569824,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7463519655529862,
+      "grad_norm": 0.3571137487888336,
+      "learning_rate": 3.329095403519776e-05,
+      "loss": 1.0149422645568849,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7527310421816442,
+      "grad_norm": 0.3296068012714386,
+      "learning_rate": 3.173463339296242e-05,
+      "loss": 1.0384585380554199,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7591101188103022,
+      "grad_norm": 0.39232197403907776,
+      "learning_rate": 3.0208679832382293e-05,
+      "loss": 0.979072380065918,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7654891954389602,
+      "grad_norm": 0.35901549458503723,
+      "learning_rate": 2.8713772156760966e-05,
+      "loss": 0.9918346405029297,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7718682720676182,
+      "grad_norm": 0.34417739510536194,
+      "learning_rate": 2.7250575358988817e-05,
+      "loss": 1.030968475341797,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7782473486962762,
+      "grad_norm": 0.40106430649757385,
+      "learning_rate": 2.581974032572836e-05,
+      "loss": 0.997005558013916,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7846264253249342,
+      "grad_norm": 0.40273308753967285,
+      "learning_rate": 2.4421903547874604e-05,
+      "loss": 1.0141830444335938,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7910055019535922,
+      "grad_norm": 0.36937829852104187,
+      "learning_rate": 2.3057686837419245e-05,
+      "loss": 0.9915987014770508,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7973845785822502,
+      "grad_norm": 0.41099807620048523,
+      "learning_rate": 2.1727697050844542e-05,
+      "loss": 1.0052967071533203,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8037636552109082,
+      "grad_norm": 0.3430964946746826,
+      "learning_rate": 2.04325258191702e-05,
+      "loss": 1.0124110221862792,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8101427318395662,
+      "grad_norm": 0.42573267221450806,
+      "learning_rate": 1.9172749284772617e-05,
+      "loss": 1.046116542816162,
+      "step": 2540
+    },
+    {
+      "epoch": 0.8165218084682242,
+      "grad_norm": 0.34176334738731384,
+      "learning_rate": 1.7948927845094743e-05,
+      "loss": 1.0312464714050293,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8229008850968822,
+      "grad_norm": 0.33181166648864746,
+      "learning_rate": 1.676160590335948e-05,
+      "loss": 0.97237548828125,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8292799617255402,
+      "grad_norm": 0.3751026391983032,
+      "learning_rate": 1.5611311626397908e-05,
+      "loss": 1.029274082183838,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8356590383541982,
+      "grad_norm": 0.3983386158943176,
+      "learning_rate": 1.4498556709700317e-05,
+      "loss": 0.979708480834961,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8420381149828562,
+      "grad_norm": 0.3683454692363739,
+      "learning_rate": 1.3423836149794189e-05,
+      "loss": 1.062849521636963,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8484171916115142,
+      "grad_norm": 0.43143871426582336,
+      "learning_rate": 1.2387628024050557e-05,
+      "loss": 1.0098539352416993,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8547962682401722,
+      "grad_norm": 0.43267059326171875,
+      "learning_rate": 1.139039327801661e-05,
+      "loss": 1.0019266128540039,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8611753448688302,
+      "grad_norm": 0.3967345952987671,
+      "learning_rate": 1.0432575520369293e-05,
+      "loss": 1.0030330657958983,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8675544214974883,
+      "grad_norm": 0.40890631079673767,
+      "learning_rate": 9.514600825581e-06,
+      "loss": 1.0057960510253907,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8739334981261463,
+      "grad_norm": 0.35522186756134033,
+      "learning_rate": 8.636877544385025e-06,
+      "loss": 1.045962429046631,
+      "step": 2740
+    },
+    {
+      "epoch": 0.8803125747548043,
+      "grad_norm": 0.3627678453922272,
+      "learning_rate": 7.799796122125414e-06,
+      "loss": 1.0400966644287108,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8866916513834623,
+      "grad_norm": 0.3707619905471802,
+      "learning_rate": 7.0037289250716846e-06,
+      "loss": 1.0188066482543945,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8930707280121203,
+      "grad_norm": 0.4088013172149658,
+      "learning_rate": 6.249030074775919e-06,
+      "loss": 0.9484004974365234,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8994498046407783,
+      "grad_norm": 0.456567645072937,
+      "learning_rate": 5.536035290545749e-06,
+      "loss": 0.9833850860595703,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9058288812694363,
+      "grad_norm": 0.34377098083496094,
+      "learning_rate": 4.865061740103361e-06,
+      "loss": 0.9835627555847168,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9122079578980943,
+      "grad_norm": 0.3588654696941376,
+      "learning_rate": 4.236407898497075e-06,
+      "loss": 1.0150874137878418,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9185870345267523,
+      "grad_norm": 0.424083411693573,
+      "learning_rate": 3.6503534153280007e-06,
+      "loss": 1.0256061553955078,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9249661111554103,
+      "grad_norm": 0.4053768217563629,
+      "learning_rate": 3.1071589903510335e-06,
+      "loss": 1.0478339195251465,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9313451877840683,
+      "grad_norm": 0.44554078578948975,
+      "learning_rate": 2.607066257505586e-06,
+      "loss": 1.00260648727417,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9377242644127263,
+      "grad_norm": 0.42239534854888916,
+      "learning_rate": 2.1502976774274043e-06,
+      "loss": 0.9838084220886231,
+      "step": 2940
+    },
+    {
+      "epoch": 0.9441033410413843,
+      "grad_norm": 0.35276949405670166,
+      "learning_rate": 1.737056438489404e-06,
+      "loss": 0.997131633758545,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504824176700423,
+      "grad_norm": 0.47409260272979736,
+      "learning_rate": 1.3675263664156723e-06,
+      "loss": 0.9645838737487793,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568614942987003,
+      "grad_norm": 0.371419757604599,
+      "learning_rate": 1.0418718425086349e-06,
+      "loss": 1.0339078903198242,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 3136,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.6103893601918464e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

sft_model/checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1032b52eb31ad96bbd04920f8f0ac477c4a3f030fd4d7d093e600fd803f6dfce
+size 5713

sft_model/checkpoint-3136/README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+base_model: /kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

sft_model/checkpoint-3136/adapter_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Qwen2ForCausalLM",
+    "parent_library": "transformers.models.qwen2.modeling_qwen2",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "/kaggle/input/models/qwen-lm/qwen2.5/transformers/7b-instruct/1",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sft_model/checkpoint-3136/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36a55b5b1902fef5f8c5417adb58fcf4e696995d7a06a9d85d073e1ddb311d3c
+size 161533192

sft_model/checkpoint-3136/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

sft_model/checkpoint-3136/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:988a4b597b4d919434a322593c8af4c75711cd252897089450edfe06aaa12c7f
+size 82465413

sft_model/checkpoint-3136/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9eac9d6a7723b8080e7d2dc1c0ff83649f83165d6096ea52eb1cb216fb590868
+size 14645

sft_model/checkpoint-3136/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d05733eb7bc070e6eecaa8131a1ce3a8724e8274ec91cf9af98a633ae7bae86f
+size 1383

sft_model/checkpoint-3136/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b20fb807a0e77feae4fa22241ef880582593039afd1a186c63e481c4939915aa
+size 1465

sft_model/checkpoint-3136/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b4360dd6a184650ffc48056c2569bc603f896c5adfe94b10f1c79f809638aa5
+size 11422166

sft_model/checkpoint-3136/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [],
+  "is_local": true,
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

sft_model/checkpoint-3136/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1126 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3136,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0063790766286580015,
+      "grad_norm": 1.1234201192855835,
+      "learning_rate": 2.4203821656050956e-05,
+      "loss": 2.6540979385375976,
+      "step": 20
+    },
+    {
+      "epoch": 0.012758153257316003,
+      "grad_norm": 0.7255190014839172,
+      "learning_rate": 4.968152866242039e-05,
+      "loss": 1.852321243286133,
+      "step": 40
+    },
+    {
+      "epoch": 0.019137229885974005,
+      "grad_norm": 0.4847199022769928,
+      "learning_rate": 7.515923566878981e-05,
+      "loss": 1.3090819358825683,
+      "step": 60
+    },
+    {
+      "epoch": 0.025516306514632006,
+      "grad_norm": 0.44303005933761597,
+      "learning_rate": 0.00010063694267515924,
+      "loss": 1.2170228004455566,
+      "step": 80
+    },
+    {
+      "epoch": 0.03189538314329001,
+      "grad_norm": 0.40238410234451294,
+      "learning_rate": 0.00012611464968152866,
+      "loss": 1.169914722442627,
+      "step": 100
+    },
+    {
+      "epoch": 0.03827445977194801,
+      "grad_norm": 0.4358486831188202,
+      "learning_rate": 0.0001515923566878981,
+      "loss": 1.1707877159118651,
+      "step": 120
+    },
+    {
+      "epoch": 0.044653536400606014,
+      "grad_norm": 0.5451267957687378,
+      "learning_rate": 0.00017707006369426754,
+      "loss": 1.1448563575744628,
+      "step": 140
+    },
+    {
+      "epoch": 0.05103261302926401,
+      "grad_norm": 0.44014137983322144,
+      "learning_rate": 0.00019999977757245233,
+      "loss": 1.1011881828308105,
+      "step": 160
+    },
+    {
+      "epoch": 0.05741168965792202,
+      "grad_norm": 0.4952339828014374,
+      "learning_rate": 0.00019997308746398385,
+      "loss": 1.1278093338012696,
+      "step": 180
+    },
+    {
+      "epoch": 0.06379076628658002,
+      "grad_norm": 0.48793825507164,
+      "learning_rate": 0.00019990192545034244,
+      "loss": 1.121494674682617,
+      "step": 200
+    },
+    {
+      "epoch": 0.07016984291523802,
+      "grad_norm": 0.36123648285865784,
+      "learning_rate": 0.00019978632318715074,
+      "loss": 1.1106701850891114,
+      "step": 220
+    },
+    {
+      "epoch": 0.07654891954389602,
+      "grad_norm": 0.3949609398841858,
+      "learning_rate": 0.0001996263320987772,
+      "loss": 1.112882137298584,
+      "step": 240
+    },
+    {
+      "epoch": 0.08292799617255402,
+      "grad_norm": 0.3579845130443573,
+      "learning_rate": 0.00019942202335546062,
+      "loss": 1.0800884246826172,
+      "step": 260
+    },
+    {
+      "epoch": 0.08930707280121203,
+      "grad_norm": 0.337467759847641,
+      "learning_rate": 0.00019917348784165092,
+      "loss": 1.1052613258361816,
+      "step": 280
+    },
+    {
+      "epoch": 0.09568614942987003,
+      "grad_norm": 0.35249248147010803,
+      "learning_rate": 0.00019888083611558014,
+      "loss": 1.0701614379882813,
+      "step": 300
+    },
+    {
+      "epoch": 0.10206522605852802,
+      "grad_norm": 0.3642044961452484,
+      "learning_rate": 0.0001985441983600819,
+      "loss": 1.061861228942871,
+      "step": 320
+    },
+    {
+      "epoch": 0.10844430268718604,
+      "grad_norm": 0.35655465722084045,
+      "learning_rate": 0.00019816372432468097,
+      "loss": 1.1088621139526367,
+      "step": 340
+    },
+    {
+      "epoch": 0.11482337931584403,
+      "grad_norm": 0.3401583135128021,
+      "learning_rate": 0.00019773958325897895,
+      "loss": 1.0811405181884766,
+      "step": 360
+    },
+    {
+      "epoch": 0.12120245594450203,
+      "grad_norm": 0.39485183358192444,
+      "learning_rate": 0.00019727196383736547,
+      "loss": 1.0966490745544433,
+      "step": 380
+    },
+    {
+      "epoch": 0.12758153257316004,
+      "grad_norm": 0.3158703148365021,
+      "learning_rate": 0.00019676107407508843,
+      "loss": 1.0605661392211914,
+      "step": 400
+    },
+    {
+      "epoch": 0.13396060920181804,
+      "grad_norm": 0.4091622531414032,
+      "learning_rate": 0.00019620714123572085,
+      "loss": 1.0958724975585938,
+      "step": 420
+    },
+    {
+      "epoch": 0.14033968583047604,
+      "grad_norm": 0.44412222504615784,
+      "learning_rate": 0.00019561041173006517,
+      "loss": 1.0952692031860352,
+      "step": 440
+    },
+    {
+      "epoch": 0.14671876245913404,
+      "grad_norm": 0.37837275862693787,
+      "learning_rate": 0.00019497115100654015,
+      "loss": 1.058570671081543,
+      "step": 460
+    },
+    {
+      "epoch": 0.15309783908779204,
+      "grad_norm": 0.3814297914505005,
+      "learning_rate": 0.00019428964343309922,
+      "loss": 1.0655178070068358,
+      "step": 480
+    },
+    {
+      "epoch": 0.15947691571645004,
+      "grad_norm": 0.4030211269855499,
+      "learning_rate": 0.00019356619217073253,
+      "loss": 1.0634085655212402,
+      "step": 500
+    },
+    {
+      "epoch": 0.16585599234510803,
+      "grad_norm": 0.38679832220077515,
+      "learning_rate": 0.00019280111903860912,
+      "loss": 1.0652976989746095,
+      "step": 520
+    },
+    {
+      "epoch": 0.17223506897376606,
+      "grad_norm": 0.3280096650123596,
+      "learning_rate": 0.00019199476437091933,
+      "loss": 1.0508193016052245,
+      "step": 540
+    },
+    {
+      "epoch": 0.17861414560242406,
+      "grad_norm": 0.3656080961227417,
+      "learning_rate": 0.0001911474868654811,
+      "loss": 1.0890400886535645,
+      "step": 560
+    },
+    {
+      "epoch": 0.18499322223108206,
+      "grad_norm": 0.3993516266345978,
+      "learning_rate": 0.00019025966342417697,
+      "loss": 1.0906559944152832,
+      "step": 580
+    },
+    {
+      "epoch": 0.19137229885974005,
+      "grad_norm": 0.36238643527030945,
+      "learning_rate": 0.00018933168898529383,
+      "loss": 1.0661024093627929,
+      "step": 600
+    },
+    {
+      "epoch": 0.19775137548839805,
+      "grad_norm": 0.40609851479530334,
+      "learning_rate": 0.00018836397634783883,
+      "loss": 1.0469740867614745,
+      "step": 620
+    },
+    {
+      "epoch": 0.20413045211705605,
+      "grad_norm": 0.44951748847961426,
+      "learning_rate": 0.00018735695598791046,
+      "loss": 1.0545502662658692,
+      "step": 640
+    },
+    {
+      "epoch": 0.21050952874571405,
+      "grad_norm": 0.3537753224372864,
+      "learning_rate": 0.00018631107586720614,
+      "loss": 1.0523859977722168,
+      "step": 660
+    },
+    {
+      "epoch": 0.21688860537437207,
+      "grad_norm": 0.37729203701019287,
+      "learning_rate": 0.0001852268012337514,
+      "loss": 1.0995121002197266,
+      "step": 680
+    },
+    {
+      "epoch": 0.22326768200303007,
+      "grad_norm": 0.35974422097206116,
+      "learning_rate": 0.00018410461441493956,
+      "loss": 1.0532832145690918,
+      "step": 700
+    },
+    {
+      "epoch": 0.22964675863168807,
+      "grad_norm": 0.5495746731758118,
+      "learning_rate": 0.00018294501460297386,
+      "loss": 1.071121883392334,
+      "step": 720
+    },
+    {
+      "epoch": 0.23602583526034607,
+      "grad_norm": 0.3257412016391754,
+      "learning_rate": 0.00018174851763280733,
+      "loss": 1.072437858581543,
+      "step": 740
+    },
+    {
+      "epoch": 0.24240491188900407,
+      "grad_norm": 0.4262811839580536,
+      "learning_rate": 0.00018051565575267939,
+      "loss": 1.052683162689209,
+      "step": 760
+    },
+    {
+      "epoch": 0.24878398851766206,
+      "grad_norm": 0.38114288449287415,
+      "learning_rate": 0.00017924697738735135,
+      "loss": 1.0581499099731446,
+      "step": 780
+    },
+    {
+      "epoch": 0.2551630651463201,
+      "grad_norm": 0.3934289515018463,
+      "learning_rate": 0.00017794304689414562,
+      "loss": 1.0319182395935058,
+      "step": 800
+    },
+    {
+      "epoch": 0.2615421417749781,
+      "grad_norm": 0.3929242789745331,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 1.0609262466430665,
+      "step": 820
+    },
+    {
+      "epoch": 0.2679212184036361,
+      "grad_norm": 0.3409107029438019,
+      "learning_rate": 0.000175231765102933,
+      "loss": 1.0357558250427246,
+      "step": 840
+    },
+    {
+      "epoch": 0.2743002950322941,
+      "grad_norm": 0.45465630292892456,
+      "learning_rate": 0.00017382561988818086,
+      "loss": 1.019899272918701,
+      "step": 860
+    },
+    {
+      "epoch": 0.2806793716609521,
+      "grad_norm": 0.31893905997276306,
+      "learning_rate": 0.00017238663417554797,
+      "loss": 1.0349628448486328,
+      "step": 880
+    },
+    {
+      "epoch": 0.2870584482896101,
+      "grad_norm": 0.47289255261421204,
+      "learning_rate": 0.00017091544808166747,
+      "loss": 1.0914591789245605,
+      "step": 900
+    },
+    {
+      "epoch": 0.2934375249182681,
+      "grad_norm": 0.4013218283653259,
+      "learning_rate": 0.00016941271604715058,
+      "loss": 1.025728416442871,
+      "step": 920
+    },
+    {
+      "epoch": 0.2998166015469261,
+      "grad_norm": 0.34149670600891113,
+      "learning_rate": 0.0001678791065454658,
+      "loss": 1.0540288925170898,
+      "step": 940
+    },
+    {
+      "epoch": 0.3061956781755841,
+      "grad_norm": 0.43255019187927246,
+      "learning_rate": 0.00016631530178557618,
+      "loss": 1.0435150146484375,
+      "step": 960
+    },
+    {
+      "epoch": 0.31257475480424207,
+      "grad_norm": 0.3473828136920929,
+      "learning_rate": 0.00016472199740846628,
+      "loss": 1.0774805068969726,
+      "step": 980
+    },
+    {
+      "epoch": 0.31895383143290007,
+      "grad_norm": 0.3513583838939667,
+      "learning_rate": 0.00016309990217769403,
+      "loss": 1.0494253158569335,
+      "step": 1000
+    },
+    {
+      "epoch": 0.32533290806155807,
+      "grad_norm": 0.35562664270401,
+      "learning_rate": 0.00016144973766410531,
+      "loss": 1.0458724975585938,
+      "step": 1020
+    },
+    {
+      "epoch": 0.33171198469021607,
+      "grad_norm": 0.35419169068336487,
+      "learning_rate": 0.00015977223792485118,
+      "loss": 1.0554842948913574,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3380910613188741,
+      "grad_norm": 0.3961597979068756,
+      "learning_rate": 0.00015806814917685084,
+      "loss": 1.035719108581543,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3444701379475321,
+      "grad_norm": 0.37739962339401245,
+      "learning_rate": 0.00015633822946484543,
+      "loss": 1.0724897384643555,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3508492145761901,
+      "grad_norm": 0.40744179487228394,
+      "learning_rate": 0.0001545832483241904,
+      "loss": 1.0812921524047852,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3572282912048481,
+      "grad_norm": 0.3863188922405243,
+      "learning_rate": 0.00015280398643853605,
+      "loss": 1.0662656784057618,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3636073678335061,
+      "grad_norm": 0.4145233929157257,
+      "learning_rate": 0.0001510012352925496,
+      "loss": 1.0171710968017578,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3699864444621641,
+      "grad_norm": 0.4319663941860199,
+      "learning_rate": 0.0001491757968198319,
+      "loss": 1.002194309234619,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3763655210908221,
+      "grad_norm": 0.370866596698761,
+      "learning_rate": 0.00014732848304618628,
+      "loss": 1.054680633544922,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3827445977194801,
+      "grad_norm": 0.39589911699295044,
+      "learning_rate": 0.0001454601157283979,
+      "loss": 1.051061248779297,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3891236743481381,
+      "grad_norm": 0.38640621304512024,
+      "learning_rate": 0.00014357152598868476,
+      "loss": 1.0340915679931642,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3955027509767961,
+      "grad_norm": 0.37239277362823486,
+      "learning_rate": 0.00014166355394498202,
+      "loss": 1.0854945182800293,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4018818276054541,
+      "grad_norm": 0.4776981472969055,
+      "learning_rate": 0.00013973704833722509,
+      "loss": 1.0340095520019532,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4082609042341121,
+      "grad_norm": 0.3754555284976959,
+      "learning_rate": 0.00013779286614979728,
+      "loss": 1.0588939666748047,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4146399808627701,
+      "grad_norm": 0.3581312596797943,
+      "learning_rate": 0.0001358318722303098,
+      "loss": 1.0207186698913575,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4210190574914281,
+      "grad_norm": 0.33835941553115845,
+      "learning_rate": 0.000133854938904884,
+      "loss": 0.996804141998291,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4273981341200861,
+      "grad_norm": 0.37163445353507996,
+      "learning_rate": 0.00013186294559010703,
+      "loss": 1.0360607147216796,
+      "step": 1340
+    },
+    {
+      "epoch": 0.43377721074874415,
+      "grad_norm": 0.40468406677246094,
+      "learning_rate": 0.0001298567784018332,
+      "loss": 1.0401429176330566,
+      "step": 1360
+    },
+    {
+      "epoch": 0.44015628737740214,
+      "grad_norm": 0.3982478380203247,
+      "learning_rate": 0.00012783732976100504,
+      "loss": 1.0549521446228027,
+      "step": 1380
+    },
+    {
+      "epoch": 0.44653536400606014,
+      "grad_norm": 0.3914712965488434,
+      "learning_rate": 0.00012580549799667034,
+      "loss": 1.025351333618164,
+      "step": 1400
+    },
+    {
+      "epoch": 0.45291444063471814,
+      "grad_norm": 0.36290040612220764,
+      "learning_rate": 0.00012376218694637028,
+      "loss": 1.0487945556640625,
+      "step": 1420
+    },
+    {
+      "epoch": 0.45929351726337614,
+      "grad_norm": 0.4272157549858093,
+      "learning_rate": 0.00012170830555407726,
+      "loss": 1.0303566932678223,
+      "step": 1440
+    },
+    {
+      "epoch": 0.46567259389203414,
+      "grad_norm": 0.4141615331172943,
+      "learning_rate": 0.00011964476746586187,
+      "loss": 1.0204460144042968,
+      "step": 1460
+    },
+    {
+      "epoch": 0.47205167052069213,
+      "grad_norm": 0.43568912148475647,
+      "learning_rate": 0.00011757249062346725,
+      "loss": 1.0501635551452637,
+      "step": 1480
+    },
+    {
+      "epoch": 0.47843074714935013,
+      "grad_norm": 0.4742281436920166,
+      "learning_rate": 0.00011549239685597327,
+      "loss": 1.0212038993835448,
+      "step": 1500
+    },
+    {
+      "epoch": 0.48480982377800813,
+      "grad_norm": 0.3556975722312927,
+      "learning_rate": 0.00011340541146973109,
+      "loss": 1.0127756118774414,
+      "step": 1520
+    },
+    {
+      "epoch": 0.49118890040666613,
+      "grad_norm": 0.3971082270145416,
+      "learning_rate": 0.0001113124628367512,
+      "loss": 1.0366864204406738,
+      "step": 1540
+    },
+    {
+      "epoch": 0.4975679770353241,
+      "grad_norm": 0.38870203495025635,
+      "learning_rate": 0.00010921448198172721,
+      "loss": 1.0176129341125488,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5039470536639822,
+      "grad_norm": 0.36224520206451416,
+      "learning_rate": 0.00010711240216788036,
+      "loss": 0.9911076545715332,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5103261302926402,
+      "grad_norm": 0.398709237575531,
+      "learning_rate": 0.0001050071584818077,
+      "loss": 1.0506349563598634,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5167052069212982,
+      "grad_norm": 0.44569212198257446,
+      "learning_rate": 0.00010289968741751914,
+      "loss": 1.0399697303771973,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5230842835499562,
+      "grad_norm": 0.3576385974884033,
+      "learning_rate": 0.00010079092645984893,
+      "loss": 1.0107657432556152,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5294633601786142,
+      "grad_norm": 0.480198472738266,
+      "learning_rate": 9.868181366742589e-05,
+      "loss": 0.9988100051879882,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5358424368072722,
+      "grad_norm": 0.36921289563179016,
+      "learning_rate": 9.657328725538849e-05,
+      "loss": 1.0404596328735352,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5422215134359302,
+      "grad_norm": 0.3534800708293915,
+      "learning_rate": 9.446628517803055e-05,
+      "loss": 1.0561634063720704,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5486005900645882,
+      "grad_norm": 0.5924835205078125,
+      "learning_rate": 9.236174471156265e-05,
+      "loss": 1.0100406646728515,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5549796666932462,
+      "grad_norm": 0.41996270418167114,
+      "learning_rate": 9.026060203717553e-05,
+      "loss": 1.0402870178222656,
+      "step": 1740
+    },
+    {
+      "epoch": 0.5613587433219042,
+      "grad_norm": 0.4084126949310303,
+      "learning_rate": 8.81637918245902e-05,
+      "loss": 1.0095555305480957,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5677378199505622,
+      "grad_norm": 0.3663918077945709,
+      "learning_rate": 8.607224681628068e-05,
+      "loss": 1.0506675720214844,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5741168965792202,
+      "grad_norm": 0.4715736210346222,
+      "learning_rate": 8.398689741255405e-05,
+      "loss": 1.0435371398925781,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5804959732078782,
+      "grad_norm": 0.43865320086479187,
+      "learning_rate": 8.190867125767228e-05,
+      "loss": 1.0446551322937012,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5868750498365362,
+      "grad_norm": 0.3950525224208832,
+      "learning_rate": 7.983849282720005e-05,
+      "loss": 1.0364575386047363,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5932541264651942,
+      "grad_norm": 0.38930827379226685,
+      "learning_rate": 7.777728301676215e-05,
+      "loss": 1.0290010452270508,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5996332030938522,
+      "grad_norm": 0.399541437625885,
+      "learning_rate": 7.572595873239367e-05,
+      "loss": 1.045849609375,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6060122797225101,
+      "grad_norm": 0.38800048828125,
+      "learning_rate": 7.368543248266436e-05,
+      "loss": 1.0078801155090331,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6123913563511681,
+      "grad_norm": 0.3803689479827881,
+      "learning_rate": 7.165661197275937e-05,
+      "loss": 0.9927312850952148,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6187704329798261,
+      "grad_norm": 0.3781611919403076,
+      "learning_rate": 6.964039970069723e-05,
+      "loss": 1.0288335800170898,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6251495096084841,
+      "grad_norm": 0.3648211658000946,
+      "learning_rate": 6.76376925558633e-05,
+      "loss": 1.016789150238037,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6315285862371421,
+      "grad_norm": 0.3644472062587738,
+      "learning_rate": 6.564938142003876e-05,
+      "loss": 1.0296217918395996,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6379076628658001,
+      "grad_norm": 0.4196849763393402,
+      "learning_rate": 6.367635077110193e-05,
+      "loss": 1.0465140342712402,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6442867394944581,
+      "grad_norm": 0.4229860305786133,
+      "learning_rate": 6.171947828957813e-05,
+      "loss": 0.9935624122619628,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6506658161231161,
+      "grad_norm": 0.3640349209308624,
+      "learning_rate": 5.97796344682134e-05,
+      "loss": 1.0277256965637207,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6570448927517741,
+      "grad_norm": 0.4113347828388214,
+      "learning_rate": 5.785768222474544e-05,
+      "loss": 1.0203803062438965,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6634239693804321,
+      "grad_norm": 0.36274972558021545,
+      "learning_rate": 5.595447651804462e-05,
+      "loss": 1.0556281089782715,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6698030460090901,
+      "grad_norm": 0.387104868888855,
+      "learning_rate": 5.4070863967794885e-05,
+      "loss": 0.9899411201477051,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6761821226377482,
+      "grad_norm": 0.38654327392578125,
+      "learning_rate": 5.220768247788458e-05,
+      "loss": 1.0453373908996582,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6825611992664062,
+      "grad_norm": 0.4250175356864929,
+      "learning_rate": 5.036576086367428e-05,
+      "loss": 1.014944362640381,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6889402758950642,
+      "grad_norm": 0.4817405343055725,
+      "learning_rate": 4.854591848330782e-05,
+      "loss": 0.9892327308654785,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6953193525237222,
+      "grad_norm": 0.4661904275417328,
+      "learning_rate": 4.6748964873229526e-05,
+      "loss": 1.0213812828063964,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7016984291523802,
+      "grad_norm": 0.42874079942703247,
+      "learning_rate": 4.49756993880715e-05,
+      "loss": 1.0591063499450684,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7080775057810382,
+      "grad_norm": 0.374489963054657,
+      "learning_rate": 4.322691084506956e-05,
+      "loss": 0.9852917671203614,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7144565824096962,
+      "grad_norm": 0.3870807886123657,
+      "learning_rate": 4.150337717316658e-05,
+      "loss": 0.9926732063293457,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7208356590383542,
+      "grad_norm": 0.3918297290802002,
+      "learning_rate": 3.9805865066959725e-05,
+      "loss": 1.018412208557129,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7272147356670122,
+      "grad_norm": 0.36913925409317017,
+      "learning_rate": 3.813512964564489e-05,
+      "loss": 1.0138113975524903,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7335938122956702,
+      "grad_norm": 0.42481136322021484,
+      "learning_rate": 3.64919141171104e-05,
+      "loss": 1.0364414215087892,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7399728889243282,
+      "grad_norm": 0.35376596450805664,
+      "learning_rate": 3.48769494473294e-05,
+      "loss": 1.0163302421569824,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7463519655529862,
+      "grad_norm": 0.3571137487888336,
+      "learning_rate": 3.329095403519776e-05,
+      "loss": 1.0149422645568849,
+      "step": 2340
+    },
+    {
+      "epoch": 0.7527310421816442,
+      "grad_norm": 0.3296068012714386,
+      "learning_rate": 3.173463339296242e-05,
+      "loss": 1.0384585380554199,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7591101188103022,
+      "grad_norm": 0.39232197403907776,
+      "learning_rate": 3.0208679832382293e-05,
+      "loss": 0.979072380065918,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7654891954389602,
+      "grad_norm": 0.35901549458503723,
+      "learning_rate": 2.8713772156760966e-05,
+      "loss": 0.9918346405029297,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7718682720676182,
+      "grad_norm": 0.34417739510536194,
+      "learning_rate": 2.7250575358988817e-05,
+      "loss": 1.030968475341797,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7782473486962762,
+      "grad_norm": 0.40106430649757385,
+      "learning_rate": 2.581974032572836e-05,
+      "loss": 0.997005558013916,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7846264253249342,
+      "grad_norm": 0.40273308753967285,
+      "learning_rate": 2.4421903547874604e-05,
+      "loss": 1.0141830444335938,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7910055019535922,
+      "grad_norm": 0.36937829852104187,
+      "learning_rate": 2.3057686837419245e-05,
+      "loss": 0.9915987014770508,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7973845785822502,
+      "grad_norm": 0.41099807620048523,
+      "learning_rate": 2.1727697050844542e-05,
+      "loss": 1.0052967071533203,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8037636552109082,
+      "grad_norm": 0.3430964946746826,
+      "learning_rate": 2.04325258191702e-05,
+      "loss": 1.0124110221862792,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8101427318395662,
+      "grad_norm": 0.42573267221450806,
+      "learning_rate": 1.9172749284772617e-05,
+      "loss": 1.046116542816162,
+      "step": 2540
+    },
+    {
+      "epoch": 0.8165218084682242,
+      "grad_norm": 0.34176334738731384,
+      "learning_rate": 1.7948927845094743e-05,
+      "loss": 1.0312464714050293,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8229008850968822,
+      "grad_norm": 0.33181166648864746,
+      "learning_rate": 1.676160590335948e-05,
+      "loss": 0.97237548828125,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8292799617255402,
+      "grad_norm": 0.3751026391983032,
+      "learning_rate": 1.5611311626397908e-05,
+      "loss": 1.029274082183838,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8356590383541982,
+      "grad_norm": 0.3983386158943176,
+      "learning_rate": 1.4498556709700317e-05,
+      "loss": 0.979708480834961,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8420381149828562,
+      "grad_norm": 0.3683454692363739,
+      "learning_rate": 1.3423836149794189e-05,
+      "loss": 1.062849521636963,
+      "step": 2640
+    },
+    {
+      "epoch": 0.8484171916115142,
+      "grad_norm": 0.43143871426582336,
+      "learning_rate": 1.2387628024050557e-05,
+      "loss": 1.0098539352416993,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8547962682401722,
+      "grad_norm": 0.43267059326171875,
+      "learning_rate": 1.139039327801661e-05,
+      "loss": 1.0019266128540039,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8611753448688302,
+      "grad_norm": 0.3967345952987671,
+      "learning_rate": 1.0432575520369293e-05,
+      "loss": 1.0030330657958983,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8675544214974883,
+      "grad_norm": 0.40890631079673767,
+      "learning_rate": 9.514600825581e-06,
+      "loss": 1.0057960510253907,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8739334981261463,
+      "grad_norm": 0.35522186756134033,
+      "learning_rate": 8.636877544385025e-06,
+      "loss": 1.045962429046631,
+      "step": 2740
+    },
+    {
+      "epoch": 0.8803125747548043,
+      "grad_norm": 0.3627678453922272,
+      "learning_rate": 7.799796122125414e-06,
+      "loss": 1.0400966644287108,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8866916513834623,
+      "grad_norm": 0.3707619905471802,
+      "learning_rate": 7.0037289250716846e-06,
+      "loss": 1.0188066482543945,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8930707280121203,
+      "grad_norm": 0.4088013172149658,
+      "learning_rate": 6.249030074775919e-06,
+      "loss": 0.9484004974365234,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8994498046407783,
+      "grad_norm": 0.456567645072937,
+      "learning_rate": 5.536035290545749e-06,
+      "loss": 0.9833850860595703,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9058288812694363,
+      "grad_norm": 0.34377098083496094,
+      "learning_rate": 4.865061740103361e-06,
+      "loss": 0.9835627555847168,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9122079578980943,
+      "grad_norm": 0.3588654696941376,
+      "learning_rate": 4.236407898497075e-06,
+      "loss": 1.0150874137878418,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9185870345267523,
+      "grad_norm": 0.424083411693573,
+      "learning_rate": 3.6503534153280007e-06,
+      "loss": 1.0256061553955078,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9249661111554103,
+      "grad_norm": 0.4053768217563629,
+      "learning_rate": 3.1071589903510335e-06,
+      "loss": 1.0478339195251465,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9313451877840683,
+      "grad_norm": 0.44554078578948975,
+      "learning_rate": 2.607066257505586e-06,
+      "loss": 1.00260648727417,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9377242644127263,
+      "grad_norm": 0.42239534854888916,
+      "learning_rate": 2.1502976774274043e-06,
+      "loss": 0.9838084220886231,
+      "step": 2940
+    },
+    {
+      "epoch": 0.9441033410413843,
+      "grad_norm": 0.35276949405670166,
+      "learning_rate": 1.737056438489404e-06,
+      "loss": 0.997131633758545,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504824176700423,
+      "grad_norm": 0.47409260272979736,
+      "learning_rate": 1.3675263664156723e-06,
+      "loss": 0.9645838737487793,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568614942987003,
+      "grad_norm": 0.371419757604599,
+      "learning_rate": 1.0418718425086349e-06,
+      "loss": 1.0339078903198242,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632405709273583,
+      "grad_norm": 0.3664110004901886,
+      "learning_rate": 7.602377305258479e-07,
+      "loss": 1.018355369567871,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696196475560163,
+      "grad_norm": 0.44556254148483276,
+      "learning_rate": 5.227493122390459e-07,
+      "loss": 1.0013629913330078,
+      "step": 3040
+    },
+    {
+      "epoch": 0.9759987241846743,
+      "grad_norm": 0.4020116627216339,
+      "learning_rate": 3.295122317038768e-07,
+      "loss": 0.9894389152526856,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9823778008133323,
+      "grad_norm": 0.39281103014945984,
+      "learning_rate": 1.806124482654159e-07,
+      "loss": 1.0337581634521484,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9887568774419903,
+      "grad_norm": 0.3942895233631134,
+      "learning_rate": 7.611619832004202e-08,
+      "loss": 1.019133949279785,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9951359540706483,
+      "grad_norm": 0.3918415307998657,
+      "learning_rate": 1.606996585096221e-08,
+      "loss": 1.0425760269165039,
+      "step": 3120
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 3136,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.7721188567082496e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

sft_model/checkpoint-3136/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1032b52eb31ad96bbd04920f8f0ac477c4a3f030fd4d7d093e600fd803f6dfce
+size 5713

sft_model/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b4360dd6a184650ffc48056c2569bc603f896c5adfe94b10f1c79f809638aa5
+size 11422166

sft_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [],
+  "is_local": true,
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}