phxdev commited on
Commit
1ec00a5
·
verified ·
1 Parent(s): aef1313

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. README.md +113 -0
  3. adapter_config.json +34 -0
  4. adapter_model.safetensors +3 -0
  5. added_tokens.json +28 -0
  6. checkpoint-159/README.md +202 -0
  7. checkpoint-159/adapter_config.json +34 -0
  8. checkpoint-159/adapter_model.safetensors +3 -0
  9. checkpoint-159/added_tokens.json +28 -0
  10. checkpoint-159/merges.txt +0 -0
  11. checkpoint-159/optimizer.pt +3 -0
  12. checkpoint-159/rng_state.pth +3 -0
  13. checkpoint-159/scheduler.pt +3 -0
  14. checkpoint-159/special_tokens_map.json +46 -0
  15. checkpoint-159/tokenizer.json +3 -0
  16. checkpoint-159/tokenizer_config.json +231 -0
  17. checkpoint-159/trainer_state.json +1146 -0
  18. checkpoint-159/training_args.bin +3 -0
  19. checkpoint-159/vocab.json +0 -0
  20. checkpoint-212/README.md +202 -0
  21. checkpoint-212/adapter_config.json +34 -0
  22. checkpoint-212/adapter_model.safetensors +3 -0
  23. checkpoint-212/added_tokens.json +28 -0
  24. checkpoint-212/merges.txt +0 -0
  25. checkpoint-212/optimizer.pt +3 -0
  26. checkpoint-212/rng_state.pth +3 -0
  27. checkpoint-212/scheduler.pt +3 -0
  28. checkpoint-212/special_tokens_map.json +46 -0
  29. checkpoint-212/tokenizer.json +3 -0
  30. checkpoint-212/tokenizer_config.json +231 -0
  31. checkpoint-212/trainer_state.json +1517 -0
  32. checkpoint-212/training_args.bin +3 -0
  33. checkpoint-212/vocab.json +0 -0
  34. checkpoint-265/README.md +202 -0
  35. checkpoint-265/adapter_config.json +34 -0
  36. checkpoint-265/adapter_model.safetensors +3 -0
  37. checkpoint-265/added_tokens.json +28 -0
  38. checkpoint-265/merges.txt +0 -0
  39. checkpoint-265/optimizer.pt +3 -0
  40. checkpoint-265/rng_state.pth +3 -0
  41. checkpoint-265/scheduler.pt +3 -0
  42. checkpoint-265/special_tokens_map.json +46 -0
  43. checkpoint-265/tokenizer.json +3 -0
  44. checkpoint-265/tokenizer_config.json +231 -0
  45. checkpoint-265/trainer_state.json +1888 -0
  46. checkpoint-265/training_args.bin +3 -0
  47. checkpoint-265/vocab.json +0 -0
  48. checkpoint-312/README.md +202 -0
  49. checkpoint-312/adapter_config.json +34 -0
  50. checkpoint-312/adapter_model.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-159/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-212/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-265/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-312/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: apache-2.0
4
+ base_model: Qwen/Qwen2.5-0.5B
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - phxdev/creed
9
+ model-index:
10
+ - name: creed-qwen-0.5b-lora
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
18
+ <details><summary>See axolotl config</summary>
19
+
20
+ axolotl version: `0.8.0.dev0`
21
+ ```yaml
22
+ base_model: Qwen/Qwen2.5-0.5B
23
+ model_type: Qwen2ForCausalLM
24
+
25
+ datasets:
26
+ - path: phxdev/creed
27
+ type: completion
28
+ field: text
29
+
30
+ output_dir: ./creed-qwen-0.5b-lora
31
+
32
+ adapter: lora
33
+ lora_r: 16
34
+ lora_alpha: 32
35
+ lora_target_modules:
36
+ - q_proj
37
+ - k_proj
38
+ - v_proj
39
+ - o_proj
40
+
41
+ micro_batch_size: 4
42
+ gradient_accumulation_steps: 4
43
+ num_epochs: 6
44
+ learning_rate: 2e-4
45
+
46
+ special_tokens:
47
+ additional_special_tokens:
48
+ - "<thinking>"
49
+ - "</thinking>"
50
+ - "<tangent>"
51
+ - "<conspiracy>"
52
+
53
+ ```
54
+
55
+ </details><br>
56
+
57
+ # creed-qwen-0.5b-lora
58
+
59
+ This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) on the phxdev/creed dataset, trained to embody the philosophical and conspiratorial musings of Creed Bratton from The Office.
60
+
61
+ ## Model description
62
+
63
+ This LoRA adapter transforms Qwen2.5-0.5B into a model that captures Creed's unique perspective on life, complete with:
64
+ - Bizarre tangential stories about his past
65
+ - Questionable business ventures and schemes
66
+ - Deep philosophical insights mixed with complete nonsense
67
+ - References to his mysterious and possibly criminal background
68
+
69
+ The model uses special tokens `<thinking>`, `</thinking>`, `<tangent>`, and `<conspiracy>` to structure Creed's unique thought patterns.
70
+
71
+ ## Intended uses & limitations
72
+
73
+ **Intended uses:**
74
+ - Entertainment and creative writing in the style of Creed Bratton
75
+ - Generating humorous, offbeat responses
76
+ - Exploring unconventional perspectives on everyday topics
77
+
78
+ **Limitations:**
79
+ - This is a character model - responses should not be taken as factual
80
+ - May generate inappropriate or nonsensical content (that's kind of the point)
81
+ - Not suitable for serious advice or factual information
82
+
83
+ ## Training and evaluation data
84
+
85
+ Trained on the phxdev/creed dataset, which contains curated examples of Creed-style responses, philosophical musings, and tangential stories.
86
+
87
+ ## Training procedure
88
+
89
+ ### Training hyperparameters
90
+
91
+ The following hyperparameters were used during training:
92
+ - learning_rate: 0.0002
93
+ - train_batch_size: 4
94
+ - eval_batch_size: 4
95
+ - seed: 42
96
+ - gradient_accumulation_steps: 4
97
+ - total_train_batch_size: 16
98
+ - optimizer: Use OptimizerNames.ADAMW_HF with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
99
+ - lr_scheduler_type: cosine
100
+ - lr_scheduler_warmup_steps: 9
101
+ - num_epochs: 6.0
102
+
103
+ ### Training results
104
+
105
+
106
+
107
+ ### Framework versions
108
+
109
+ - PEFT 0.14.0
110
+ - Transformers 4.49.0
111
+ - Pytorch 2.5.1+cu124
112
+ - Datasets 3.2.0
113
+ - Tokenizers 0.21.0
adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-0.5B",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "o_proj",
27
+ "v_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b35702578df1a77f49fbaf57a0081dd8c817e7fe1a66b861ce989c67e99d0c2
3
+ size 8676008
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</thinking>": 151666,
3
+ "</tool_call>": 151658,
4
+ "<conspiracy>": 151668,
5
+ "<tangent>": 151667,
6
+ "<thinking>": 151665,
7
+ "<tool_call>": 151657,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-159/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-0.5B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-159/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-0.5B",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "o_proj",
27
+ "v_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-159/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d248a90c798121a599538eaae1adac6dd0c47f65c535276f289b2a865ecf3811
3
+ size 8676008
checkpoint-159/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</thinking>": 151666,
3
+ "</tool_call>": 151658,
4
+ "<conspiracy>": 151668,
5
+ "<tangent>": 151667,
6
+ "<thinking>": 151665,
7
+ "<tool_call>": 151657,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-159/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-159/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d1f6cd27a5d282115da91a93556f4c93f3253750b509b1abac28ce339725129
3
+ size 17414842
checkpoint-159/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
3
+ size 14244
checkpoint-159/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6da9404b1e9a931261bcbe4d4974ed209914e0fab12f4bbfc7392d9718599e6
3
+ size 1064
checkpoint-159/special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<thinking>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</thinking>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<tangent>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<conspiracy>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ ],
32
+ "eos_token": {
33
+ "content": "<|endoftext|>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<|endoftext|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ }
46
+ }
checkpoint-159/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d6915fdc377a021898a989de25f3e54ffcc6c1e9497f10812eb8a4504a7f01
3
+ size 11422646
checkpoint-159/tokenizer_config.json ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<thinking>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "</thinking>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<tangent>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<conspiracy>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<thinking>",
216
+ "</thinking>",
217
+ "<tangent>",
218
+ "<conspiracy>"
219
+ ],
220
+ "bos_token": null,
221
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
222
+ "clean_up_tokenization_spaces": false,
223
+ "eos_token": "<|endoftext|>",
224
+ "errors": "replace",
225
+ "extra_special_tokens": {},
226
+ "model_max_length": 131072,
227
+ "pad_token": "<|endoftext|>",
228
+ "split_special_tokens": false,
229
+ "tokenizer_class": "Qwen2Tokenizer",
230
+ "unk_token": null
231
+ }
checkpoint-159/trainer_state.json ADDED
@@ -0,0 +1,1146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 159,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.019138755980861243,
13
+ "grad_norm": 3.7146408557891846,
14
+ "learning_rate": 2.2222222222222223e-05,
15
+ "loss": 4.4869,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.03827751196172249,
20
+ "grad_norm": 3.3118133544921875,
21
+ "learning_rate": 4.4444444444444447e-05,
22
+ "loss": 4.1867,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.05741626794258373,
27
+ "grad_norm": 2.972708225250244,
28
+ "learning_rate": 6.666666666666667e-05,
29
+ "loss": 4.001,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.07655502392344497,
34
+ "grad_norm": 4.938202381134033,
35
+ "learning_rate": 8.888888888888889e-05,
36
+ "loss": 5.0582,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.09569377990430622,
41
+ "grad_norm": 3.5732812881469727,
42
+ "learning_rate": 0.00011111111111111112,
43
+ "loss": 4.5871,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.11483253588516747,
48
+ "grad_norm": 3.350315570831299,
49
+ "learning_rate": 0.00013333333333333334,
50
+ "loss": 4.0071,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.1339712918660287,
55
+ "grad_norm": 3.4415643215179443,
56
+ "learning_rate": 0.00015555555555555556,
57
+ "loss": 3.9791,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.15311004784688995,
62
+ "grad_norm": 2.558781385421753,
63
+ "learning_rate": 0.00017777777777777779,
64
+ "loss": 3.6497,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.1722488038277512,
69
+ "grad_norm": 2.3021087646484375,
70
+ "learning_rate": 0.0002,
71
+ "loss": 3.5205,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.19138755980861244,
76
+ "grad_norm": 2.301999568939209,
77
+ "learning_rate": 0.00019999462497359466,
78
+ "loss": 4.112,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.21052631578947367,
83
+ "grad_norm": 3.0552637577056885,
84
+ "learning_rate": 0.0001999785004721968,
85
+ "loss": 3.8723,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.22966507177033493,
90
+ "grad_norm": 2.5972537994384766,
91
+ "learning_rate": 0.00019995162822919883,
92
+ "loss": 3.8135,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.24880382775119617,
97
+ "grad_norm": 2.0281920433044434,
98
+ "learning_rate": 0.00019991401113338104,
99
+ "loss": 3.8702,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.2679425837320574,
104
+ "grad_norm": 1.7147849798202515,
105
+ "learning_rate": 0.00019986565322860115,
106
+ "loss": 3.463,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.28708133971291866,
111
+ "grad_norm": 2.082582473754883,
112
+ "learning_rate": 0.00019980655971335945,
113
+ "loss": 3.3816,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3062200956937799,
118
+ "grad_norm": 2.1299426555633545,
119
+ "learning_rate": 0.00019973673694024,
120
+ "loss": 3.698,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.3253588516746411,
125
+ "grad_norm": 1.8626389503479004,
126
+ "learning_rate": 0.0001996561924152278,
127
+ "loss": 3.3583,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.3444976076555024,
132
+ "grad_norm": 2.452871322631836,
133
+ "learning_rate": 0.0001995649347969019,
134
+ "loss": 3.4957,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.36363636363636365,
139
+ "grad_norm": 2.265108585357666,
140
+ "learning_rate": 0.00019946297389550433,
141
+ "loss": 3.1115,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.3827751196172249,
146
+ "grad_norm": 1.996728777885437,
147
+ "learning_rate": 0.0001993503206718859,
148
+ "loss": 3.4159,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.4019138755980861,
153
+ "grad_norm": 1.913594365119934,
154
+ "learning_rate": 0.00019922698723632767,
155
+ "loss": 3.3288,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.42105263157894735,
160
+ "grad_norm": 2.4316132068634033,
161
+ "learning_rate": 0.00019909298684723904,
162
+ "loss": 3.4245,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.44019138755980863,
167
+ "grad_norm": 1.998693823814392,
168
+ "learning_rate": 0.00019894833390973266,
169
+ "loss": 3.1687,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.45933014354066987,
174
+ "grad_norm": 2.21382737159729,
175
+ "learning_rate": 0.0001987930439740757,
176
+ "loss": 3.4307,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.4784688995215311,
181
+ "grad_norm": 2.586013078689575,
182
+ "learning_rate": 0.0001986271337340182,
183
+ "loss": 3.3596,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.49760765550239233,
188
+ "grad_norm": 2.8244550228118896,
189
+ "learning_rate": 0.0001984506210249986,
190
+ "loss": 3.3107,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.5167464114832536,
195
+ "grad_norm": 2.0228700637817383,
196
+ "learning_rate": 0.00019826352482222638,
197
+ "loss": 3.1749,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.5358851674641149,
202
+ "grad_norm": 2.7035820484161377,
203
+ "learning_rate": 0.0001980658652386421,
204
+ "loss": 3.0995,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.5550239234449761,
209
+ "grad_norm": 2.119741916656494,
210
+ "learning_rate": 0.00019785766352275542,
211
+ "loss": 3.225,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.5741626794258373,
216
+ "grad_norm": 2.5071310997009277,
217
+ "learning_rate": 0.00019763894205636072,
218
+ "loss": 3.0066,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.5933014354066986,
223
+ "grad_norm": 2.992201566696167,
224
+ "learning_rate": 0.00019740972435213115,
225
+ "loss": 3.0412,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.6124401913875598,
230
+ "grad_norm": 2.820875883102417,
231
+ "learning_rate": 0.00019717003505109095,
232
+ "loss": 3.3575,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.631578947368421,
237
+ "grad_norm": 2.7096059322357178,
238
+ "learning_rate": 0.00019691989991996663,
239
+ "loss": 3.3531,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.6507177033492823,
244
+ "grad_norm": 2.172783374786377,
245
+ "learning_rate": 0.00019665934584841682,
246
+ "loss": 3.2852,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.6698564593301436,
251
+ "grad_norm": 3.238025188446045,
252
+ "learning_rate": 0.00019638840084614182,
253
+ "loss": 3.3818,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.6889952153110048,
258
+ "grad_norm": 2.92851185798645,
259
+ "learning_rate": 0.00019610709403987246,
260
+ "loss": 3.1,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.7081339712918661,
265
+ "grad_norm": 2.514800786972046,
266
+ "learning_rate": 0.000195815455670239,
267
+ "loss": 3.0883,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.7272727272727273,
272
+ "grad_norm": 3.264613151550293,
273
+ "learning_rate": 0.0001955135170885202,
274
+ "loss": 2.9932,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.7464114832535885,
279
+ "grad_norm": 2.4111247062683105,
280
+ "learning_rate": 0.00019520131075327298,
281
+ "loss": 2.974,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.7655502392344498,
286
+ "grad_norm": 2.692473888397217,
287
+ "learning_rate": 0.00019487887022684336,
288
+ "loss": 2.8822,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.784688995215311,
293
+ "grad_norm": 3.3863365650177,
294
+ "learning_rate": 0.00019454623017175812,
295
+ "loss": 3.0255,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.8038277511961722,
300
+ "grad_norm": 2.2267720699310303,
301
+ "learning_rate": 0.0001942034263469989,
302
+ "loss": 3.1883,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.8229665071770335,
307
+ "grad_norm": 2.31858491897583,
308
+ "learning_rate": 0.00019385049560415794,
309
+ "loss": 2.9882,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.8421052631578947,
314
+ "grad_norm": 2.3098323345184326,
315
+ "learning_rate": 0.00019348747588347637,
316
+ "loss": 2.8801,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.861244019138756,
321
+ "grad_norm": 3.3286585807800293,
322
+ "learning_rate": 0.00019311440620976597,
323
+ "loss": 2.9762,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.8803827751196173,
328
+ "grad_norm": 3.1082146167755127,
329
+ "learning_rate": 0.00019273132668821364,
330
+ "loss": 2.8514,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.8995215311004785,
335
+ "grad_norm": 2.2908411026000977,
336
+ "learning_rate": 0.00019233827850007027,
337
+ "loss": 3.2993,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.9186602870813397,
342
+ "grad_norm": 2.1068387031555176,
343
+ "learning_rate": 0.00019193530389822363,
344
+ "loss": 3.0606,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.937799043062201,
349
+ "grad_norm": 2.951885938644409,
350
+ "learning_rate": 0.0001915224462026563,
351
+ "loss": 3.042,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.9569377990430622,
356
+ "grad_norm": 2.2476351261138916,
357
+ "learning_rate": 0.0001910997497957885,
358
+ "loss": 2.9928,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.9760765550239234,
363
+ "grad_norm": 1.9801242351531982,
364
+ "learning_rate": 0.00019066726011770726,
365
+ "loss": 2.8911,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.9952153110047847,
370
+ "grad_norm": 2.5246548652648926,
371
+ "learning_rate": 0.00019022502366128135,
372
+ "loss": 3.2457,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 1.0,
377
+ "grad_norm": 5.682666778564453,
378
+ "learning_rate": 0.0001897730879671634,
379
+ "loss": 2.4435,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 1.0191387559808613,
384
+ "grad_norm": 2.66831374168396,
385
+ "learning_rate": 0.00018931150161867916,
386
+ "loss": 2.7807,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 1.0382775119617225,
391
+ "grad_norm": 2.5246026515960693,
392
+ "learning_rate": 0.0001888403142366049,
393
+ "loss": 2.7599,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 1.0574162679425838,
398
+ "grad_norm": 1.959625244140625,
399
+ "learning_rate": 0.00018835957647383303,
400
+ "loss": 2.9087,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 1.076555023923445,
405
+ "grad_norm": 2.277261257171631,
406
+ "learning_rate": 0.00018786934000992688,
407
+ "loss": 3.2283,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 1.0956937799043063,
412
+ "grad_norm": 3.0258898735046387,
413
+ "learning_rate": 0.00018736965754556528,
414
+ "loss": 3.0084,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 1.1148325358851674,
419
+ "grad_norm": 2.4277517795562744,
420
+ "learning_rate": 0.00018686058279687698,
421
+ "loss": 2.7159,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 1.1339712918660287,
426
+ "grad_norm": 3.0732321739196777,
427
+ "learning_rate": 0.00018634217048966637,
428
+ "loss": 2.9704,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 1.1531100478468899,
433
+ "grad_norm": 3.0256996154785156,
434
+ "learning_rate": 0.0001858144763535302,
435
+ "loss": 3.0254,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 1.1722488038277512,
440
+ "grad_norm": 2.7575695514678955,
441
+ "learning_rate": 0.00018527755711586678,
442
+ "loss": 2.4907,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 1.1913875598086126,
447
+ "grad_norm": 2.813037157058716,
448
+ "learning_rate": 0.00018473147049577774,
449
+ "loss": 2.8598,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 1.2105263157894737,
454
+ "grad_norm": 2.197244644165039,
455
+ "learning_rate": 0.00018417627519786315,
456
+ "loss": 2.965,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 1.229665071770335,
461
+ "grad_norm": 2.0711350440979004,
462
+ "learning_rate": 0.00018361203090591071,
463
+ "loss": 2.9582,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 1.2488038277511961,
468
+ "grad_norm": 2.7295780181884766,
469
+ "learning_rate": 0.00018303879827647975,
470
+ "loss": 2.5148,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 1.2679425837320575,
475
+ "grad_norm": 2.511603593826294,
476
+ "learning_rate": 0.00018245663893238075,
477
+ "loss": 2.7535,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 1.2870813397129186,
482
+ "grad_norm": 3.695086717605591,
483
+ "learning_rate": 0.00018186561545605054,
484
+ "loss": 2.6003,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 1.30622009569378,
489
+ "grad_norm": 3.2395761013031006,
490
+ "learning_rate": 0.00018126579138282503,
491
+ "loss": 2.7334,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 1.325358851674641,
496
+ "grad_norm": 3.004142999649048,
497
+ "learning_rate": 0.00018065723119410884,
498
+ "loss": 2.7788,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 1.3444976076555024,
503
+ "grad_norm": 2.964301824569702,
504
+ "learning_rate": 0.0001800400003104436,
505
+ "loss": 2.734,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 1.3636363636363638,
510
+ "grad_norm": 3.981093645095825,
511
+ "learning_rate": 0.00017941416508447536,
512
+ "loss": 2.7476,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 1.3827751196172249,
517
+ "grad_norm": 3.2536420822143555,
518
+ "learning_rate": 0.00017877979279382135,
519
+ "loss": 2.5386,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 1.401913875598086,
524
+ "grad_norm": 3.6163337230682373,
525
+ "learning_rate": 0.0001781369516338378,
526
+ "loss": 2.6767,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 1.4210526315789473,
531
+ "grad_norm": 3.6883926391601562,
532
+ "learning_rate": 0.000177485710710289,
533
+ "loss": 2.5656,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 1.4401913875598087,
538
+ "grad_norm": 3.5389018058776855,
539
+ "learning_rate": 0.00017682614003191807,
540
+ "loss": 2.6626,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 1.4593301435406698,
545
+ "grad_norm": 2.324506998062134,
546
+ "learning_rate": 0.0001761583105029213,
547
+ "loss": 2.6479,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 1.4784688995215312,
552
+ "grad_norm": 2.271515130996704,
553
+ "learning_rate": 0.00017548229391532572,
554
+ "loss": 2.874,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 1.4976076555023923,
559
+ "grad_norm": 3.023533821105957,
560
+ "learning_rate": 0.00017479816294127152,
561
+ "loss": 2.4017,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 1.5167464114832536,
566
+ "grad_norm": 4.101243495941162,
567
+ "learning_rate": 0.0001741059911251997,
568
+ "loss": 3.0185,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 1.535885167464115,
573
+ "grad_norm": 3.056877374649048,
574
+ "learning_rate": 0.00017340585287594604,
575
+ "loss": 2.7875,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 1.555023923444976,
580
+ "grad_norm": 3.0255823135375977,
581
+ "learning_rate": 0.00017269782345874203,
582
+ "loss": 2.8453,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 1.5741626794258372,
587
+ "grad_norm": 3.57423734664917,
588
+ "learning_rate": 0.00017198197898712404,
589
+ "loss": 2.6948,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 1.5933014354066986,
594
+ "grad_norm": 3.436167001724243,
595
+ "learning_rate": 0.00017125839641475072,
596
+ "loss": 2.6287,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 1.61244019138756,
601
+ "grad_norm": 3.1058871746063232,
602
+ "learning_rate": 0.00017052715352713075,
603
+ "loss": 2.5887,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 1.631578947368421,
608
+ "grad_norm": 2.1073200702667236,
609
+ "learning_rate": 0.00016978832893326074,
610
+ "loss": 2.9573,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 1.6507177033492821,
615
+ "grad_norm": 2.8039920330047607,
616
+ "learning_rate": 0.0001690420020571747,
617
+ "loss": 2.9652,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 1.6698564593301435,
622
+ "grad_norm": 2.8494677543640137,
623
+ "learning_rate": 0.00016828825312940592,
624
+ "loss": 2.6263,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 1.6889952153110048,
629
+ "grad_norm": 2.3521246910095215,
630
+ "learning_rate": 0.00016752716317836229,
631
+ "loss": 2.683,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 1.7081339712918662,
636
+ "grad_norm": 2.5750181674957275,
637
+ "learning_rate": 0.00016675881402161536,
638
+ "loss": 2.611,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 1.7272727272727273,
643
+ "grad_norm": 2.687619924545288,
644
+ "learning_rate": 0.00016598328825710533,
645
+ "loss": 2.523,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 1.7464114832535884,
650
+ "grad_norm": 3.112954616546631,
651
+ "learning_rate": 0.00016520066925426144,
652
+ "loss": 2.6558,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 1.7655502392344498,
657
+ "grad_norm": 3.4932713508605957,
658
+ "learning_rate": 0.0001644110411450398,
659
+ "loss": 2.6962,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 1.784688995215311,
664
+ "grad_norm": 2.564894437789917,
665
+ "learning_rate": 0.00016361448881487914,
666
+ "loss": 2.8202,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 1.8038277511961722,
671
+ "grad_norm": 3.1496503353118896,
672
+ "learning_rate": 0.0001628110978935756,
673
+ "loss": 2.2734,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 1.8229665071770333,
678
+ "grad_norm": 2.6274123191833496,
679
+ "learning_rate": 0.00016200095474607753,
680
+ "loss": 2.5264,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 1.8421052631578947,
685
+ "grad_norm": 2.374180555343628,
686
+ "learning_rate": 0.0001611841464632011,
687
+ "loss": 2.8034,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 1.861244019138756,
692
+ "grad_norm": 2.691254138946533,
693
+ "learning_rate": 0.00016036076085226814,
694
+ "loss": 2.5935,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 1.8803827751196174,
699
+ "grad_norm": 2.9795515537261963,
700
+ "learning_rate": 0.0001595308864276666,
701
+ "loss": 2.8707,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 1.8995215311004785,
706
+ "grad_norm": 3.1781864166259766,
707
+ "learning_rate": 0.0001586946124013354,
708
+ "loss": 2.458,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 1.9186602870813396,
713
+ "grad_norm": 2.8759453296661377,
714
+ "learning_rate": 0.00015785202867317407,
715
+ "loss": 2.5201,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 1.937799043062201,
720
+ "grad_norm": 3.2317118644714355,
721
+ "learning_rate": 0.00015700322582137827,
722
+ "loss": 2.5585,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 1.9569377990430623,
727
+ "grad_norm": 3.463688373565674,
728
+ "learning_rate": 0.0001561482950927029,
729
+ "loss": 2.4652,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 1.9760765550239234,
734
+ "grad_norm": 2.4766316413879395,
735
+ "learning_rate": 0.00015528732839265272,
736
+ "loss": 2.5966,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 1.9952153110047846,
741
+ "grad_norm": 2.8042709827423096,
742
+ "learning_rate": 0.00015442041827560274,
743
+ "loss": 2.5278,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 2.0,
748
+ "grad_norm": 8.298028945922852,
749
+ "learning_rate": 0.00015354765793484834,
750
+ "loss": 2.8732,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 2.0191387559808613,
755
+ "grad_norm": 3.808393716812134,
756
+ "learning_rate": 0.000152669141192587,
757
+ "loss": 2.1442,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 2.0382775119617227,
762
+ "grad_norm": 3.3381223678588867,
763
+ "learning_rate": 0.00015178496248983254,
764
+ "loss": 2.6125,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 2.0574162679425836,
769
+ "grad_norm": 4.778241157531738,
770
+ "learning_rate": 0.00015089521687626243,
771
+ "loss": 2.399,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 2.076555023923445,
776
+ "grad_norm": 2.613919973373413,
777
+ "learning_rate": 0.00015000000000000001,
778
+ "loss": 2.5189,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 2.0956937799043063,
783
+ "grad_norm": 3.6656932830810547,
784
+ "learning_rate": 0.00014909940809733222,
785
+ "loss": 2.2785,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 2.1148325358851676,
790
+ "grad_norm": 2.968078136444092,
791
+ "learning_rate": 0.00014819353798236427,
792
+ "loss": 2.3605,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 2.1339712918660285,
797
+ "grad_norm": 2.7252070903778076,
798
+ "learning_rate": 0.00014728248703661182,
799
+ "loss": 2.173,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 2.15311004784689,
804
+ "grad_norm": 3.9389491081237793,
805
+ "learning_rate": 0.00014636635319853275,
806
+ "loss": 2.457,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 2.172248803827751,
811
+ "grad_norm": 3.658862590789795,
812
+ "learning_rate": 0.00014544523495299842,
813
+ "loss": 2.6971,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 2.1913875598086126,
818
+ "grad_norm": 3.303403377532959,
819
+ "learning_rate": 0.0001445192313207067,
820
+ "loss": 2.7851,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 2.2105263157894735,
825
+ "grad_norm": 3.910428047180176,
826
+ "learning_rate": 0.00014358844184753712,
827
+ "loss": 2.1422,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 2.229665071770335,
832
+ "grad_norm": 3.3043367862701416,
833
+ "learning_rate": 0.00014265296659384956,
834
+ "loss": 2.5404,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 2.248803827751196,
839
+ "grad_norm": 2.9098987579345703,
840
+ "learning_rate": 0.0001417129061237278,
841
+ "loss": 2.567,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 2.2679425837320575,
846
+ "grad_norm": 4.142232894897461,
847
+ "learning_rate": 0.00014076836149416887,
848
+ "loss": 2.4179,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 2.287081339712919,
853
+ "grad_norm": 2.110104560852051,
854
+ "learning_rate": 0.00013981943424421932,
855
+ "loss": 2.4976,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 2.3062200956937797,
860
+ "grad_norm": 2.6828229427337646,
861
+ "learning_rate": 0.00013886622638405952,
862
+ "loss": 2.5762,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 2.325358851674641,
867
+ "grad_norm": 3.0066471099853516,
868
+ "learning_rate": 0.00013790884038403795,
869
+ "loss": 2.2882,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 2.3444976076555024,
874
+ "grad_norm": 3.791444778442383,
875
+ "learning_rate": 0.00013694737916365517,
876
+ "loss": 2.1788,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 2.3636363636363638,
881
+ "grad_norm": 2.78275203704834,
882
+ "learning_rate": 0.0001359819460805001,
883
+ "loss": 2.6037,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 2.382775119617225,
888
+ "grad_norm": 4.18953275680542,
889
+ "learning_rate": 0.00013501264491913906,
890
+ "loss": 2.3284,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 2.401913875598086,
895
+ "grad_norm": 2.925140142440796,
896
+ "learning_rate": 0.00013403957987995882,
897
+ "loss": 2.4364,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 2.4210526315789473,
902
+ "grad_norm": 4.545037746429443,
903
+ "learning_rate": 0.00013306285556796495,
904
+ "loss": 2.4096,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 2.4401913875598087,
909
+ "grad_norm": 3.785428524017334,
910
+ "learning_rate": 0.00013208257698153677,
911
+ "loss": 2.1047,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 2.45933014354067,
916
+ "grad_norm": 3.6228346824645996,
917
+ "learning_rate": 0.00013109884950114007,
918
+ "loss": 2.5744,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 2.478468899521531,
923
+ "grad_norm": 2.9221742153167725,
924
+ "learning_rate": 0.00013011177887799845,
925
+ "loss": 2.5266,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 2.4976076555023923,
930
+ "grad_norm": 3.659484386444092,
931
+ "learning_rate": 0.00012912147122272523,
932
+ "loss": 2.3707,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 2.5167464114832536,
937
+ "grad_norm": 3.5442514419555664,
938
+ "learning_rate": 0.00012812803299391628,
939
+ "loss": 2.4695,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 2.535885167464115,
944
+ "grad_norm": 3.1291420459747314,
945
+ "learning_rate": 0.0001271315709867059,
946
+ "loss": 2.687,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 2.555023923444976,
951
+ "grad_norm": 4.138225078582764,
952
+ "learning_rate": 0.00012613219232128608,
953
+ "loss": 2.2378,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 2.574162679425837,
958
+ "grad_norm": 2.8483548164367676,
959
+ "learning_rate": 0.00012513000443139112,
960
+ "loss": 2.4044,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 2.5933014354066986,
965
+ "grad_norm": 2.434741497039795,
966
+ "learning_rate": 0.00012412511505274844,
967
+ "loss": 2.5664,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 2.61244019138756,
972
+ "grad_norm": 3.9319725036621094,
973
+ "learning_rate": 0.000123117632211497,
974
+ "loss": 2.2586,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 2.6315789473684212,
979
+ "grad_norm": 3.4802486896514893,
980
+ "learning_rate": 0.0001221076642125742,
981
+ "loss": 2.0743,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 2.650717703349282,
986
+ "grad_norm": 3.1535286903381348,
987
+ "learning_rate": 0.00012109531962807332,
988
+ "loss": 2.302,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 2.6698564593301435,
993
+ "grad_norm": 2.9818458557128906,
994
+ "learning_rate": 0.00012008070728557186,
995
+ "loss": 2.4418,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 2.688995215311005,
1000
+ "grad_norm": 4.8768630027771,
1001
+ "learning_rate": 0.00011906393625643244,
1002
+ "loss": 2.5083,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 2.708133971291866,
1007
+ "grad_norm": 3.8520619869232178,
1008
+ "learning_rate": 0.00011804511584407763,
1009
+ "loss": 1.9994,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 2.7272727272727275,
1014
+ "grad_norm": 3.784248113632202,
1015
+ "learning_rate": 0.00011702435557223987,
1016
+ "loss": 2.2376,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 2.7464114832535884,
1021
+ "grad_norm": 4.1650800704956055,
1022
+ "learning_rate": 0.00011600176517318741,
1023
+ "loss": 2.3886,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 2.7655502392344498,
1028
+ "grad_norm": 4.099468231201172,
1029
+ "learning_rate": 0.00011497745457592816,
1030
+ "loss": 2.5978,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 2.784688995215311,
1035
+ "grad_norm": 4.268674850463867,
1036
+ "learning_rate": 0.00011395153389439233,
1037
+ "loss": 2.4882,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 2.803827751196172,
1042
+ "grad_norm": 4.081464767456055,
1043
+ "learning_rate": 0.0001129241134155949,
1044
+ "loss": 2.6547,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 2.8229665071770333,
1049
+ "grad_norm": 3.1537716388702393,
1050
+ "learning_rate": 0.00011189530358778005,
1051
+ "loss": 2.5361,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 2.8421052631578947,
1056
+ "grad_norm": 4.182295322418213,
1057
+ "learning_rate": 0.00011086521500854745,
1058
+ "loss": 2.385,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 2.861244019138756,
1063
+ "grad_norm": 2.5511474609375,
1064
+ "learning_rate": 0.00010983395841296348,
1065
+ "loss": 2.4617,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 2.8803827751196174,
1070
+ "grad_norm": 3.1007962226867676,
1071
+ "learning_rate": 0.00010880164466165674,
1072
+ "loss": 2.5788,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 2.8995215311004783,
1077
+ "grad_norm": 4.509490966796875,
1078
+ "learning_rate": 0.00010776838472890065,
1079
+ "loss": 2.1361,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 2.9186602870813396,
1084
+ "grad_norm": 2.6765851974487305,
1085
+ "learning_rate": 0.00010673428969068364,
1086
+ "loss": 2.3922,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 2.937799043062201,
1091
+ "grad_norm": 3.704310894012451,
1092
+ "learning_rate": 0.00010569947071276847,
1093
+ "loss": 2.6924,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 2.9569377990430623,
1098
+ "grad_norm": 3.935804843902588,
1099
+ "learning_rate": 0.00010466403903874176,
1100
+ "loss": 2.2886,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 2.9760765550239237,
1105
+ "grad_norm": 4.105613708496094,
1106
+ "learning_rate": 0.00010362810597805526,
1107
+ "loss": 2.3865,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 2.9952153110047846,
1112
+ "grad_norm": 3.669766664505005,
1113
+ "learning_rate": 0.00010259178289406011,
1114
+ "loss": 2.2158,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 3.0,
1119
+ "grad_norm": 8.930411338806152,
1120
+ "learning_rate": 0.0001015551811920351,
1121
+ "loss": 2.5214,
1122
+ "step": 159
1123
+ }
1124
+ ],
1125
+ "logging_steps": 1,
1126
+ "max_steps": 312,
1127
+ "num_input_tokens_seen": 0,
1128
+ "num_train_epochs": 6,
1129
+ "save_steps": 500,
1130
+ "stateful_callbacks": {
1131
+ "TrainerControl": {
1132
+ "args": {
1133
+ "should_epoch_stop": false,
1134
+ "should_evaluate": false,
1135
+ "should_log": false,
1136
+ "should_save": true,
1137
+ "should_training_stop": false
1138
+ },
1139
+ "attributes": {}
1140
+ }
1141
+ },
1142
+ "total_flos": 444654924595200.0,
1143
+ "train_batch_size": 4,
1144
+ "trial_name": null,
1145
+ "trial_params": null
1146
+ }
checkpoint-159/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8d8f75328c89f0f8d97ecc3fb21f0a76fa9b188979afd06060c2f286d07806
3
+ size 6456
checkpoint-159/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-212/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-0.5B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-212/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-0.5B",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "o_proj",
27
+ "v_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-212/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e7dbdc2e7d984b5dc158bc10664105c60721ba227ba3a7b3353aef6f8460d18
3
+ size 8676008
checkpoint-212/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</thinking>": 151666,
3
+ "</tool_call>": 151658,
4
+ "<conspiracy>": 151668,
5
+ "<tangent>": 151667,
6
+ "<thinking>": 151665,
7
+ "<tool_call>": 151657,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-212/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-212/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ff395a5f407d4468dd57ad28e9751fa2588681befa64d1b9ba640a6853dea1e
3
+ size 17414842
checkpoint-212/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
3
+ size 14244
checkpoint-212/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:288192b9ff441685e8033e99a50e2c6d9e1d9e489dfcdc015ea371fcfe3641af
3
+ size 1064
checkpoint-212/special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<thinking>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</thinking>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<tangent>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<conspiracy>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ ],
32
+ "eos_token": {
33
+ "content": "<|endoftext|>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<|endoftext|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ }
46
+ }
checkpoint-212/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d6915fdc377a021898a989de25f3e54ffcc6c1e9497f10812eb8a4504a7f01
3
+ size 11422646
checkpoint-212/tokenizer_config.json ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<thinking>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "</thinking>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<tangent>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<conspiracy>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<thinking>",
216
+ "</thinking>",
217
+ "<tangent>",
218
+ "<conspiracy>"
219
+ ],
220
+ "bos_token": null,
221
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
222
+ "clean_up_tokenization_spaces": false,
223
+ "eos_token": "<|endoftext|>",
224
+ "errors": "replace",
225
+ "extra_special_tokens": {},
226
+ "model_max_length": 131072,
227
+ "pad_token": "<|endoftext|>",
228
+ "split_special_tokens": false,
229
+ "tokenizer_class": "Qwen2Tokenizer",
230
+ "unk_token": null
231
+ }
checkpoint-212/trainer_state.json ADDED
@@ -0,0 +1,1517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 212,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.019138755980861243,
13
+ "grad_norm": 3.7146408557891846,
14
+ "learning_rate": 2.2222222222222223e-05,
15
+ "loss": 4.4869,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.03827751196172249,
20
+ "grad_norm": 3.3118133544921875,
21
+ "learning_rate": 4.4444444444444447e-05,
22
+ "loss": 4.1867,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.05741626794258373,
27
+ "grad_norm": 2.972708225250244,
28
+ "learning_rate": 6.666666666666667e-05,
29
+ "loss": 4.001,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.07655502392344497,
34
+ "grad_norm": 4.938202381134033,
35
+ "learning_rate": 8.888888888888889e-05,
36
+ "loss": 5.0582,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.09569377990430622,
41
+ "grad_norm": 3.5732812881469727,
42
+ "learning_rate": 0.00011111111111111112,
43
+ "loss": 4.5871,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.11483253588516747,
48
+ "grad_norm": 3.350315570831299,
49
+ "learning_rate": 0.00013333333333333334,
50
+ "loss": 4.0071,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.1339712918660287,
55
+ "grad_norm": 3.4415643215179443,
56
+ "learning_rate": 0.00015555555555555556,
57
+ "loss": 3.9791,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.15311004784688995,
62
+ "grad_norm": 2.558781385421753,
63
+ "learning_rate": 0.00017777777777777779,
64
+ "loss": 3.6497,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.1722488038277512,
69
+ "grad_norm": 2.3021087646484375,
70
+ "learning_rate": 0.0002,
71
+ "loss": 3.5205,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.19138755980861244,
76
+ "grad_norm": 2.301999568939209,
77
+ "learning_rate": 0.00019999462497359466,
78
+ "loss": 4.112,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.21052631578947367,
83
+ "grad_norm": 3.0552637577056885,
84
+ "learning_rate": 0.0001999785004721968,
85
+ "loss": 3.8723,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.22966507177033493,
90
+ "grad_norm": 2.5972537994384766,
91
+ "learning_rate": 0.00019995162822919883,
92
+ "loss": 3.8135,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.24880382775119617,
97
+ "grad_norm": 2.0281920433044434,
98
+ "learning_rate": 0.00019991401113338104,
99
+ "loss": 3.8702,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.2679425837320574,
104
+ "grad_norm": 1.7147849798202515,
105
+ "learning_rate": 0.00019986565322860115,
106
+ "loss": 3.463,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.28708133971291866,
111
+ "grad_norm": 2.082582473754883,
112
+ "learning_rate": 0.00019980655971335945,
113
+ "loss": 3.3816,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3062200956937799,
118
+ "grad_norm": 2.1299426555633545,
119
+ "learning_rate": 0.00019973673694024,
120
+ "loss": 3.698,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.3253588516746411,
125
+ "grad_norm": 1.8626389503479004,
126
+ "learning_rate": 0.0001996561924152278,
127
+ "loss": 3.3583,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.3444976076555024,
132
+ "grad_norm": 2.452871322631836,
133
+ "learning_rate": 0.0001995649347969019,
134
+ "loss": 3.4957,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.36363636363636365,
139
+ "grad_norm": 2.265108585357666,
140
+ "learning_rate": 0.00019946297389550433,
141
+ "loss": 3.1115,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.3827751196172249,
146
+ "grad_norm": 1.996728777885437,
147
+ "learning_rate": 0.0001993503206718859,
148
+ "loss": 3.4159,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.4019138755980861,
153
+ "grad_norm": 1.913594365119934,
154
+ "learning_rate": 0.00019922698723632767,
155
+ "loss": 3.3288,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.42105263157894735,
160
+ "grad_norm": 2.4316132068634033,
161
+ "learning_rate": 0.00019909298684723904,
162
+ "loss": 3.4245,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.44019138755980863,
167
+ "grad_norm": 1.998693823814392,
168
+ "learning_rate": 0.00019894833390973266,
169
+ "loss": 3.1687,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.45933014354066987,
174
+ "grad_norm": 2.21382737159729,
175
+ "learning_rate": 0.0001987930439740757,
176
+ "loss": 3.4307,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.4784688995215311,
181
+ "grad_norm": 2.586013078689575,
182
+ "learning_rate": 0.0001986271337340182,
183
+ "loss": 3.3596,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.49760765550239233,
188
+ "grad_norm": 2.8244550228118896,
189
+ "learning_rate": 0.0001984506210249986,
190
+ "loss": 3.3107,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.5167464114832536,
195
+ "grad_norm": 2.0228700637817383,
196
+ "learning_rate": 0.00019826352482222638,
197
+ "loss": 3.1749,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.5358851674641149,
202
+ "grad_norm": 2.7035820484161377,
203
+ "learning_rate": 0.0001980658652386421,
204
+ "loss": 3.0995,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.5550239234449761,
209
+ "grad_norm": 2.119741916656494,
210
+ "learning_rate": 0.00019785766352275542,
211
+ "loss": 3.225,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.5741626794258373,
216
+ "grad_norm": 2.5071310997009277,
217
+ "learning_rate": 0.00019763894205636072,
218
+ "loss": 3.0066,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.5933014354066986,
223
+ "grad_norm": 2.992201566696167,
224
+ "learning_rate": 0.00019740972435213115,
225
+ "loss": 3.0412,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.6124401913875598,
230
+ "grad_norm": 2.820875883102417,
231
+ "learning_rate": 0.00019717003505109095,
232
+ "loss": 3.3575,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.631578947368421,
237
+ "grad_norm": 2.7096059322357178,
238
+ "learning_rate": 0.00019691989991996663,
239
+ "loss": 3.3531,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.6507177033492823,
244
+ "grad_norm": 2.172783374786377,
245
+ "learning_rate": 0.00019665934584841682,
246
+ "loss": 3.2852,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.6698564593301436,
251
+ "grad_norm": 3.238025188446045,
252
+ "learning_rate": 0.00019638840084614182,
253
+ "loss": 3.3818,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.6889952153110048,
258
+ "grad_norm": 2.92851185798645,
259
+ "learning_rate": 0.00019610709403987246,
260
+ "loss": 3.1,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.7081339712918661,
265
+ "grad_norm": 2.514800786972046,
266
+ "learning_rate": 0.000195815455670239,
267
+ "loss": 3.0883,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.7272727272727273,
272
+ "grad_norm": 3.264613151550293,
273
+ "learning_rate": 0.0001955135170885202,
274
+ "loss": 2.9932,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.7464114832535885,
279
+ "grad_norm": 2.4111247062683105,
280
+ "learning_rate": 0.00019520131075327298,
281
+ "loss": 2.974,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.7655502392344498,
286
+ "grad_norm": 2.692473888397217,
287
+ "learning_rate": 0.00019487887022684336,
288
+ "loss": 2.8822,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.784688995215311,
293
+ "grad_norm": 3.3863365650177,
294
+ "learning_rate": 0.00019454623017175812,
295
+ "loss": 3.0255,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.8038277511961722,
300
+ "grad_norm": 2.2267720699310303,
301
+ "learning_rate": 0.0001942034263469989,
302
+ "loss": 3.1883,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.8229665071770335,
307
+ "grad_norm": 2.31858491897583,
308
+ "learning_rate": 0.00019385049560415794,
309
+ "loss": 2.9882,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.8421052631578947,
314
+ "grad_norm": 2.3098323345184326,
315
+ "learning_rate": 0.00019348747588347637,
316
+ "loss": 2.8801,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.861244019138756,
321
+ "grad_norm": 3.3286585807800293,
322
+ "learning_rate": 0.00019311440620976597,
323
+ "loss": 2.9762,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.8803827751196173,
328
+ "grad_norm": 3.1082146167755127,
329
+ "learning_rate": 0.00019273132668821364,
330
+ "loss": 2.8514,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.8995215311004785,
335
+ "grad_norm": 2.2908411026000977,
336
+ "learning_rate": 0.00019233827850007027,
337
+ "loss": 3.2993,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.9186602870813397,
342
+ "grad_norm": 2.1068387031555176,
343
+ "learning_rate": 0.00019193530389822363,
344
+ "loss": 3.0606,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.937799043062201,
349
+ "grad_norm": 2.951885938644409,
350
+ "learning_rate": 0.0001915224462026563,
351
+ "loss": 3.042,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.9569377990430622,
356
+ "grad_norm": 2.2476351261138916,
357
+ "learning_rate": 0.0001910997497957885,
358
+ "loss": 2.9928,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.9760765550239234,
363
+ "grad_norm": 1.9801242351531982,
364
+ "learning_rate": 0.00019066726011770726,
365
+ "loss": 2.8911,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.9952153110047847,
370
+ "grad_norm": 2.5246548652648926,
371
+ "learning_rate": 0.00019022502366128135,
372
+ "loss": 3.2457,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 1.0,
377
+ "grad_norm": 5.682666778564453,
378
+ "learning_rate": 0.0001897730879671634,
379
+ "loss": 2.4435,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 1.0191387559808613,
384
+ "grad_norm": 2.66831374168396,
385
+ "learning_rate": 0.00018931150161867916,
386
+ "loss": 2.7807,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 1.0382775119617225,
391
+ "grad_norm": 2.5246026515960693,
392
+ "learning_rate": 0.0001888403142366049,
393
+ "loss": 2.7599,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 1.0574162679425838,
398
+ "grad_norm": 1.959625244140625,
399
+ "learning_rate": 0.00018835957647383303,
400
+ "loss": 2.9087,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 1.076555023923445,
405
+ "grad_norm": 2.277261257171631,
406
+ "learning_rate": 0.00018786934000992688,
407
+ "loss": 3.2283,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 1.0956937799043063,
412
+ "grad_norm": 3.0258898735046387,
413
+ "learning_rate": 0.00018736965754556528,
414
+ "loss": 3.0084,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 1.1148325358851674,
419
+ "grad_norm": 2.4277517795562744,
420
+ "learning_rate": 0.00018686058279687698,
421
+ "loss": 2.7159,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 1.1339712918660287,
426
+ "grad_norm": 3.0732321739196777,
427
+ "learning_rate": 0.00018634217048966637,
428
+ "loss": 2.9704,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 1.1531100478468899,
433
+ "grad_norm": 3.0256996154785156,
434
+ "learning_rate": 0.0001858144763535302,
435
+ "loss": 3.0254,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 1.1722488038277512,
440
+ "grad_norm": 2.7575695514678955,
441
+ "learning_rate": 0.00018527755711586678,
442
+ "loss": 2.4907,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 1.1913875598086126,
447
+ "grad_norm": 2.813037157058716,
448
+ "learning_rate": 0.00018473147049577774,
449
+ "loss": 2.8598,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 1.2105263157894737,
454
+ "grad_norm": 2.197244644165039,
455
+ "learning_rate": 0.00018417627519786315,
456
+ "loss": 2.965,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 1.229665071770335,
461
+ "grad_norm": 2.0711350440979004,
462
+ "learning_rate": 0.00018361203090591071,
463
+ "loss": 2.9582,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 1.2488038277511961,
468
+ "grad_norm": 2.7295780181884766,
469
+ "learning_rate": 0.00018303879827647975,
470
+ "loss": 2.5148,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 1.2679425837320575,
475
+ "grad_norm": 2.511603593826294,
476
+ "learning_rate": 0.00018245663893238075,
477
+ "loss": 2.7535,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 1.2870813397129186,
482
+ "grad_norm": 3.695086717605591,
483
+ "learning_rate": 0.00018186561545605054,
484
+ "loss": 2.6003,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 1.30622009569378,
489
+ "grad_norm": 3.2395761013031006,
490
+ "learning_rate": 0.00018126579138282503,
491
+ "loss": 2.7334,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 1.325358851674641,
496
+ "grad_norm": 3.004142999649048,
497
+ "learning_rate": 0.00018065723119410884,
498
+ "loss": 2.7788,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 1.3444976076555024,
503
+ "grad_norm": 2.964301824569702,
504
+ "learning_rate": 0.0001800400003104436,
505
+ "loss": 2.734,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 1.3636363636363638,
510
+ "grad_norm": 3.981093645095825,
511
+ "learning_rate": 0.00017941416508447536,
512
+ "loss": 2.7476,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 1.3827751196172249,
517
+ "grad_norm": 3.2536420822143555,
518
+ "learning_rate": 0.00017877979279382135,
519
+ "loss": 2.5386,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 1.401913875598086,
524
+ "grad_norm": 3.6163337230682373,
525
+ "learning_rate": 0.0001781369516338378,
526
+ "loss": 2.6767,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 1.4210526315789473,
531
+ "grad_norm": 3.6883926391601562,
532
+ "learning_rate": 0.000177485710710289,
533
+ "loss": 2.5656,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 1.4401913875598087,
538
+ "grad_norm": 3.5389018058776855,
539
+ "learning_rate": 0.00017682614003191807,
540
+ "loss": 2.6626,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 1.4593301435406698,
545
+ "grad_norm": 2.324506998062134,
546
+ "learning_rate": 0.0001761583105029213,
547
+ "loss": 2.6479,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 1.4784688995215312,
552
+ "grad_norm": 2.271515130996704,
553
+ "learning_rate": 0.00017548229391532572,
554
+ "loss": 2.874,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 1.4976076555023923,
559
+ "grad_norm": 3.023533821105957,
560
+ "learning_rate": 0.00017479816294127152,
561
+ "loss": 2.4017,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 1.5167464114832536,
566
+ "grad_norm": 4.101243495941162,
567
+ "learning_rate": 0.0001741059911251997,
568
+ "loss": 3.0185,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 1.535885167464115,
573
+ "grad_norm": 3.056877374649048,
574
+ "learning_rate": 0.00017340585287594604,
575
+ "loss": 2.7875,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 1.555023923444976,
580
+ "grad_norm": 3.0255823135375977,
581
+ "learning_rate": 0.00017269782345874203,
582
+ "loss": 2.8453,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 1.5741626794258372,
587
+ "grad_norm": 3.57423734664917,
588
+ "learning_rate": 0.00017198197898712404,
589
+ "loss": 2.6948,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 1.5933014354066986,
594
+ "grad_norm": 3.436167001724243,
595
+ "learning_rate": 0.00017125839641475072,
596
+ "loss": 2.6287,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 1.61244019138756,
601
+ "grad_norm": 3.1058871746063232,
602
+ "learning_rate": 0.00017052715352713075,
603
+ "loss": 2.5887,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 1.631578947368421,
608
+ "grad_norm": 2.1073200702667236,
609
+ "learning_rate": 0.00016978832893326074,
610
+ "loss": 2.9573,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 1.6507177033492821,
615
+ "grad_norm": 2.8039920330047607,
616
+ "learning_rate": 0.0001690420020571747,
617
+ "loss": 2.9652,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 1.6698564593301435,
622
+ "grad_norm": 2.8494677543640137,
623
+ "learning_rate": 0.00016828825312940592,
624
+ "loss": 2.6263,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 1.6889952153110048,
629
+ "grad_norm": 2.3521246910095215,
630
+ "learning_rate": 0.00016752716317836229,
631
+ "loss": 2.683,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 1.7081339712918662,
636
+ "grad_norm": 2.5750181674957275,
637
+ "learning_rate": 0.00016675881402161536,
638
+ "loss": 2.611,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 1.7272727272727273,
643
+ "grad_norm": 2.687619924545288,
644
+ "learning_rate": 0.00016598328825710533,
645
+ "loss": 2.523,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 1.7464114832535884,
650
+ "grad_norm": 3.112954616546631,
651
+ "learning_rate": 0.00016520066925426144,
652
+ "loss": 2.6558,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 1.7655502392344498,
657
+ "grad_norm": 3.4932713508605957,
658
+ "learning_rate": 0.0001644110411450398,
659
+ "loss": 2.6962,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 1.784688995215311,
664
+ "grad_norm": 2.564894437789917,
665
+ "learning_rate": 0.00016361448881487914,
666
+ "loss": 2.8202,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 1.8038277511961722,
671
+ "grad_norm": 3.1496503353118896,
672
+ "learning_rate": 0.0001628110978935756,
673
+ "loss": 2.2734,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 1.8229665071770333,
678
+ "grad_norm": 2.6274123191833496,
679
+ "learning_rate": 0.00016200095474607753,
680
+ "loss": 2.5264,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 1.8421052631578947,
685
+ "grad_norm": 2.374180555343628,
686
+ "learning_rate": 0.0001611841464632011,
687
+ "loss": 2.8034,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 1.861244019138756,
692
+ "grad_norm": 2.691254138946533,
693
+ "learning_rate": 0.00016036076085226814,
694
+ "loss": 2.5935,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 1.8803827751196174,
699
+ "grad_norm": 2.9795515537261963,
700
+ "learning_rate": 0.0001595308864276666,
701
+ "loss": 2.8707,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 1.8995215311004785,
706
+ "grad_norm": 3.1781864166259766,
707
+ "learning_rate": 0.0001586946124013354,
708
+ "loss": 2.458,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 1.9186602870813396,
713
+ "grad_norm": 2.8759453296661377,
714
+ "learning_rate": 0.00015785202867317407,
715
+ "loss": 2.5201,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 1.937799043062201,
720
+ "grad_norm": 3.2317118644714355,
721
+ "learning_rate": 0.00015700322582137827,
722
+ "loss": 2.5585,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 1.9569377990430623,
727
+ "grad_norm": 3.463688373565674,
728
+ "learning_rate": 0.0001561482950927029,
729
+ "loss": 2.4652,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 1.9760765550239234,
734
+ "grad_norm": 2.4766316413879395,
735
+ "learning_rate": 0.00015528732839265272,
736
+ "loss": 2.5966,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 1.9952153110047846,
741
+ "grad_norm": 2.8042709827423096,
742
+ "learning_rate": 0.00015442041827560274,
743
+ "loss": 2.5278,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 2.0,
748
+ "grad_norm": 8.298028945922852,
749
+ "learning_rate": 0.00015354765793484834,
750
+ "loss": 2.8732,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 2.0191387559808613,
755
+ "grad_norm": 3.808393716812134,
756
+ "learning_rate": 0.000152669141192587,
757
+ "loss": 2.1442,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 2.0382775119617227,
762
+ "grad_norm": 3.3381223678588867,
763
+ "learning_rate": 0.00015178496248983254,
764
+ "loss": 2.6125,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 2.0574162679425836,
769
+ "grad_norm": 4.778241157531738,
770
+ "learning_rate": 0.00015089521687626243,
771
+ "loss": 2.399,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 2.076555023923445,
776
+ "grad_norm": 2.613919973373413,
777
+ "learning_rate": 0.00015000000000000001,
778
+ "loss": 2.5189,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 2.0956937799043063,
783
+ "grad_norm": 3.6656932830810547,
784
+ "learning_rate": 0.00014909940809733222,
785
+ "loss": 2.2785,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 2.1148325358851676,
790
+ "grad_norm": 2.968078136444092,
791
+ "learning_rate": 0.00014819353798236427,
792
+ "loss": 2.3605,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 2.1339712918660285,
797
+ "grad_norm": 2.7252070903778076,
798
+ "learning_rate": 0.00014728248703661182,
799
+ "loss": 2.173,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 2.15311004784689,
804
+ "grad_norm": 3.9389491081237793,
805
+ "learning_rate": 0.00014636635319853275,
806
+ "loss": 2.457,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 2.172248803827751,
811
+ "grad_norm": 3.658862590789795,
812
+ "learning_rate": 0.00014544523495299842,
813
+ "loss": 2.6971,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 2.1913875598086126,
818
+ "grad_norm": 3.303403377532959,
819
+ "learning_rate": 0.0001445192313207067,
820
+ "loss": 2.7851,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 2.2105263157894735,
825
+ "grad_norm": 3.910428047180176,
826
+ "learning_rate": 0.00014358844184753712,
827
+ "loss": 2.1422,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 2.229665071770335,
832
+ "grad_norm": 3.3043367862701416,
833
+ "learning_rate": 0.00014265296659384956,
834
+ "loss": 2.5404,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 2.248803827751196,
839
+ "grad_norm": 2.9098987579345703,
840
+ "learning_rate": 0.0001417129061237278,
841
+ "loss": 2.567,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 2.2679425837320575,
846
+ "grad_norm": 4.142232894897461,
847
+ "learning_rate": 0.00014076836149416887,
848
+ "loss": 2.4179,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 2.287081339712919,
853
+ "grad_norm": 2.110104560852051,
854
+ "learning_rate": 0.00013981943424421932,
855
+ "loss": 2.4976,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 2.3062200956937797,
860
+ "grad_norm": 2.6828229427337646,
861
+ "learning_rate": 0.00013886622638405952,
862
+ "loss": 2.5762,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 2.325358851674641,
867
+ "grad_norm": 3.0066471099853516,
868
+ "learning_rate": 0.00013790884038403795,
869
+ "loss": 2.2882,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 2.3444976076555024,
874
+ "grad_norm": 3.791444778442383,
875
+ "learning_rate": 0.00013694737916365517,
876
+ "loss": 2.1788,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 2.3636363636363638,
881
+ "grad_norm": 2.78275203704834,
882
+ "learning_rate": 0.0001359819460805001,
883
+ "loss": 2.6037,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 2.382775119617225,
888
+ "grad_norm": 4.18953275680542,
889
+ "learning_rate": 0.00013501264491913906,
890
+ "loss": 2.3284,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 2.401913875598086,
895
+ "grad_norm": 2.925140142440796,
896
+ "learning_rate": 0.00013403957987995882,
897
+ "loss": 2.4364,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 2.4210526315789473,
902
+ "grad_norm": 4.545037746429443,
903
+ "learning_rate": 0.00013306285556796495,
904
+ "loss": 2.4096,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 2.4401913875598087,
909
+ "grad_norm": 3.785428524017334,
910
+ "learning_rate": 0.00013208257698153677,
911
+ "loss": 2.1047,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 2.45933014354067,
916
+ "grad_norm": 3.6228346824645996,
917
+ "learning_rate": 0.00013109884950114007,
918
+ "loss": 2.5744,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 2.478468899521531,
923
+ "grad_norm": 2.9221742153167725,
924
+ "learning_rate": 0.00013011177887799845,
925
+ "loss": 2.5266,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 2.4976076555023923,
930
+ "grad_norm": 3.659484386444092,
931
+ "learning_rate": 0.00012912147122272523,
932
+ "loss": 2.3707,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 2.5167464114832536,
937
+ "grad_norm": 3.5442514419555664,
938
+ "learning_rate": 0.00012812803299391628,
939
+ "loss": 2.4695,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 2.535885167464115,
944
+ "grad_norm": 3.1291420459747314,
945
+ "learning_rate": 0.0001271315709867059,
946
+ "loss": 2.687,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 2.555023923444976,
951
+ "grad_norm": 4.138225078582764,
952
+ "learning_rate": 0.00012613219232128608,
953
+ "loss": 2.2378,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 2.574162679425837,
958
+ "grad_norm": 2.8483548164367676,
959
+ "learning_rate": 0.00012513000443139112,
960
+ "loss": 2.4044,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 2.5933014354066986,
965
+ "grad_norm": 2.434741497039795,
966
+ "learning_rate": 0.00012412511505274844,
967
+ "loss": 2.5664,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 2.61244019138756,
972
+ "grad_norm": 3.9319725036621094,
973
+ "learning_rate": 0.000123117632211497,
974
+ "loss": 2.2586,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 2.6315789473684212,
979
+ "grad_norm": 3.4802486896514893,
980
+ "learning_rate": 0.0001221076642125742,
981
+ "loss": 2.0743,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 2.650717703349282,
986
+ "grad_norm": 3.1535286903381348,
987
+ "learning_rate": 0.00012109531962807332,
988
+ "loss": 2.302,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 2.6698564593301435,
993
+ "grad_norm": 2.9818458557128906,
994
+ "learning_rate": 0.00012008070728557186,
995
+ "loss": 2.4418,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 2.688995215311005,
1000
+ "grad_norm": 4.8768630027771,
1001
+ "learning_rate": 0.00011906393625643244,
1002
+ "loss": 2.5083,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 2.708133971291866,
1007
+ "grad_norm": 3.8520619869232178,
1008
+ "learning_rate": 0.00011804511584407763,
1009
+ "loss": 1.9994,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 2.7272727272727275,
1014
+ "grad_norm": 3.784248113632202,
1015
+ "learning_rate": 0.00011702435557223987,
1016
+ "loss": 2.2376,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 2.7464114832535884,
1021
+ "grad_norm": 4.1650800704956055,
1022
+ "learning_rate": 0.00011600176517318741,
1023
+ "loss": 2.3886,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 2.7655502392344498,
1028
+ "grad_norm": 4.099468231201172,
1029
+ "learning_rate": 0.00011497745457592816,
1030
+ "loss": 2.5978,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 2.784688995215311,
1035
+ "grad_norm": 4.268674850463867,
1036
+ "learning_rate": 0.00011395153389439233,
1037
+ "loss": 2.4882,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 2.803827751196172,
1042
+ "grad_norm": 4.081464767456055,
1043
+ "learning_rate": 0.0001129241134155949,
1044
+ "loss": 2.6547,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 2.8229665071770333,
1049
+ "grad_norm": 3.1537716388702393,
1050
+ "learning_rate": 0.00011189530358778005,
1051
+ "loss": 2.5361,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 2.8421052631578947,
1056
+ "grad_norm": 4.182295322418213,
1057
+ "learning_rate": 0.00011086521500854745,
1058
+ "loss": 2.385,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 2.861244019138756,
1063
+ "grad_norm": 2.5511474609375,
1064
+ "learning_rate": 0.00010983395841296348,
1065
+ "loss": 2.4617,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 2.8803827751196174,
1070
+ "grad_norm": 3.1007962226867676,
1071
+ "learning_rate": 0.00010880164466165674,
1072
+ "loss": 2.5788,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 2.8995215311004783,
1077
+ "grad_norm": 4.509490966796875,
1078
+ "learning_rate": 0.00010776838472890065,
1079
+ "loss": 2.1361,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 2.9186602870813396,
1084
+ "grad_norm": 2.6765851974487305,
1085
+ "learning_rate": 0.00010673428969068364,
1086
+ "loss": 2.3922,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 2.937799043062201,
1091
+ "grad_norm": 3.704310894012451,
1092
+ "learning_rate": 0.00010569947071276847,
1093
+ "loss": 2.6924,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 2.9569377990430623,
1098
+ "grad_norm": 3.935804843902588,
1099
+ "learning_rate": 0.00010466403903874176,
1100
+ "loss": 2.2886,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 2.9760765550239237,
1105
+ "grad_norm": 4.105613708496094,
1106
+ "learning_rate": 0.00010362810597805526,
1107
+ "loss": 2.3865,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 2.9952153110047846,
1112
+ "grad_norm": 3.669766664505005,
1113
+ "learning_rate": 0.00010259178289406011,
1114
+ "loss": 2.2158,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 3.0,
1119
+ "grad_norm": 8.930411338806152,
1120
+ "learning_rate": 0.0001015551811920351,
1121
+ "loss": 2.5214,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 3.0191387559808613,
1126
+ "grad_norm": 3.3217484951019287,
1127
+ "learning_rate": 0.00010051841230721065,
1128
+ "loss": 2.5409,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 3.0382775119617227,
1133
+ "grad_norm": 3.8041253089904785,
1134
+ "learning_rate": 9.948158769278939e-05,
1135
+ "loss": 1.8309,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 3.0574162679425836,
1140
+ "grad_norm": 3.892636775970459,
1141
+ "learning_rate": 9.844481880796491e-05,
1142
+ "loss": 2.0955,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 3.076555023923445,
1147
+ "grad_norm": 3.4822261333465576,
1148
+ "learning_rate": 9.740821710593989e-05,
1149
+ "loss": 2.2865,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 3.0956937799043063,
1154
+ "grad_norm": 3.033822774887085,
1155
+ "learning_rate": 9.637189402194476e-05,
1156
+ "loss": 2.3693,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 3.1148325358851676,
1161
+ "grad_norm": 3.693204641342163,
1162
+ "learning_rate": 9.533596096125825e-05,
1163
+ "loss": 1.984,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 3.1339712918660285,
1168
+ "grad_norm": 3.3877508640289307,
1169
+ "learning_rate": 9.430052928723153e-05,
1170
+ "loss": 2.0891,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 3.15311004784689,
1175
+ "grad_norm": 4.376189708709717,
1176
+ "learning_rate": 9.326571030931637e-05,
1177
+ "loss": 2.1974,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 3.172248803827751,
1182
+ "grad_norm": 3.557032823562622,
1183
+ "learning_rate": 9.223161527109937e-05,
1184
+ "loss": 1.9757,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 3.1913875598086126,
1189
+ "grad_norm": 2.733353853225708,
1190
+ "learning_rate": 9.119835533834331e-05,
1191
+ "loss": 2.4171,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 3.2105263157894735,
1196
+ "grad_norm": 2.7016165256500244,
1197
+ "learning_rate": 9.016604158703654e-05,
1198
+ "loss": 2.2992,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 3.229665071770335,
1203
+ "grad_norm": 3.997654438018799,
1204
+ "learning_rate": 8.913478499145254e-05,
1205
+ "loss": 2.1117,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 3.248803827751196,
1210
+ "grad_norm": 4.044878005981445,
1211
+ "learning_rate": 8.810469641222001e-05,
1212
+ "loss": 2.245,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 3.2679425837320575,
1217
+ "grad_norm": 3.080991506576538,
1218
+ "learning_rate": 8.707588658440511e-05,
1219
+ "loss": 2.1612,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 3.287081339712919,
1224
+ "grad_norm": 3.295807123184204,
1225
+ "learning_rate": 8.604846610560771e-05,
1226
+ "loss": 2.3217,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 3.3062200956937797,
1231
+ "grad_norm": 3.5904176235198975,
1232
+ "learning_rate": 8.502254542407186e-05,
1233
+ "loss": 2.3138,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 3.325358851674641,
1238
+ "grad_norm": 4.395754814147949,
1239
+ "learning_rate": 8.399823482681262e-05,
1240
+ "loss": 1.9074,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 3.3444976076555024,
1245
+ "grad_norm": 3.2221572399139404,
1246
+ "learning_rate": 8.297564442776014e-05,
1247
+ "loss": 2.2977,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 3.3636363636363638,
1252
+ "grad_norm": 2.9927215576171875,
1253
+ "learning_rate": 8.195488415592238e-05,
1254
+ "loss": 2.3785,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 3.382775119617225,
1259
+ "grad_norm": 3.9036011695861816,
1260
+ "learning_rate": 8.093606374356759e-05,
1261
+ "loss": 1.9962,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 3.401913875598086,
1266
+ "grad_norm": 4.485937595367432,
1267
+ "learning_rate": 7.991929271442817e-05,
1268
+ "loss": 1.7251,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 3.4210526315789473,
1273
+ "grad_norm": 4.750828742980957,
1274
+ "learning_rate": 7.89046803719267e-05,
1275
+ "loss": 2.1263,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 3.4401913875598087,
1280
+ "grad_norm": 4.138678550720215,
1281
+ "learning_rate": 7.789233578742582e-05,
1282
+ "loss": 2.0091,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 3.45933014354067,
1287
+ "grad_norm": 3.6726274490356445,
1288
+ "learning_rate": 7.688236778850306e-05,
1289
+ "loss": 2.3806,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 3.478468899521531,
1294
+ "grad_norm": 4.481295108795166,
1295
+ "learning_rate": 7.587488494725157e-05,
1296
+ "loss": 2.1338,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 3.4976076555023923,
1301
+ "grad_norm": 3.9401016235351562,
1302
+ "learning_rate": 7.48699955686089e-05,
1303
+ "loss": 2.1403,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 3.5167464114832536,
1308
+ "grad_norm": 4.227544784545898,
1309
+ "learning_rate": 7.386780767871397e-05,
1310
+ "loss": 2.3207,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 3.535885167464115,
1315
+ "grad_norm": 3.4885573387145996,
1316
+ "learning_rate": 7.286842901329412e-05,
1317
+ "loss": 2.2671,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 3.555023923444976,
1322
+ "grad_norm": 4.438218593597412,
1323
+ "learning_rate": 7.187196700608373e-05,
1324
+ "loss": 2.0748,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 3.574162679425837,
1329
+ "grad_norm": 3.766284465789795,
1330
+ "learning_rate": 7.087852877727481e-05,
1331
+ "loss": 2.5101,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 3.5933014354066986,
1336
+ "grad_norm": 4.027716636657715,
1337
+ "learning_rate": 6.988822112200156e-05,
1338
+ "loss": 2.3361,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 3.61244019138756,
1343
+ "grad_norm": 4.409999370574951,
1344
+ "learning_rate": 6.890115049885994e-05,
1345
+ "loss": 2.2492,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 3.6315789473684212,
1350
+ "grad_norm": 3.596459150314331,
1351
+ "learning_rate": 6.791742301846326e-05,
1352
+ "loss": 2.2855,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 3.650717703349282,
1357
+ "grad_norm": 4.667017459869385,
1358
+ "learning_rate": 6.693714443203507e-05,
1359
+ "loss": 2.083,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 3.6698564593301435,
1364
+ "grad_norm": 4.831173896789551,
1365
+ "learning_rate": 6.59604201200412e-05,
1366
+ "loss": 2.1568,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 3.688995215311005,
1371
+ "grad_norm": 3.5013201236724854,
1372
+ "learning_rate": 6.498735508086093e-05,
1373
+ "loss": 2.108,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 3.708133971291866,
1378
+ "grad_norm": 4.176932334899902,
1379
+ "learning_rate": 6.40180539194999e-05,
1380
+ "loss": 1.8315,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 3.7272727272727275,
1385
+ "grad_norm": 5.187565803527832,
1386
+ "learning_rate": 6.305262083634488e-05,
1387
+ "loss": 2.3541,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 3.7464114832535884,
1392
+ "grad_norm": 4.090083599090576,
1393
+ "learning_rate": 6.209115961596208e-05,
1394
+ "loss": 2.0691,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 3.7655502392344498,
1399
+ "grad_norm": 3.806030750274658,
1400
+ "learning_rate": 6.113377361594049e-05,
1401
+ "loss": 2.0471,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 3.784688995215311,
1406
+ "grad_norm": 4.668728828430176,
1407
+ "learning_rate": 6.018056575578075e-05,
1408
+ "loss": 2.335,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 3.803827751196172,
1413
+ "grad_norm": 4.811546325683594,
1414
+ "learning_rate": 5.923163850583113e-05,
1415
+ "loss": 2.2242,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 3.8229665071770333,
1420
+ "grad_norm": 5.359763145446777,
1421
+ "learning_rate": 5.828709387627218e-05,
1422
+ "loss": 2.3298,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 3.8421052631578947,
1427
+ "grad_norm": 3.5501046180725098,
1428
+ "learning_rate": 5.73470334061505e-05,
1429
+ "loss": 2.1297,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 3.861244019138756,
1434
+ "grad_norm": 3.4878952503204346,
1435
+ "learning_rate": 5.6411558152462894e-05,
1436
+ "loss": 2.3615,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 3.8803827751196174,
1441
+ "grad_norm": 4.381737232208252,
1442
+ "learning_rate": 5.54807686792933e-05,
1443
+ "loss": 2.0084,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 3.8995215311004783,
1448
+ "grad_norm": 5.2298359870910645,
1449
+ "learning_rate": 5.4554765047001613e-05,
1450
+ "loss": 2.362,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 3.9186602870813396,
1455
+ "grad_norm": 3.3613922595977783,
1456
+ "learning_rate": 5.363364680146725e-05,
1457
+ "loss": 2.1292,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 3.937799043062201,
1462
+ "grad_norm": 4.079115867614746,
1463
+ "learning_rate": 5.271751296338823e-05,
1464
+ "loss": 2.3561,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 3.9569377990430623,
1469
+ "grad_norm": 4.030163764953613,
1470
+ "learning_rate": 5.180646201763577e-05,
1471
+ "loss": 2.1954,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 3.9760765550239237,
1476
+ "grad_norm": 4.383935928344727,
1477
+ "learning_rate": 5.090059190266779e-05,
1478
+ "loss": 2.0793,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 3.9952153110047846,
1483
+ "grad_norm": 5.1565775871276855,
1484
+ "learning_rate": 5.000000000000002e-05,
1485
+ "loss": 2.0134,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 4.0,
1490
+ "grad_norm": 8.855152130126953,
1491
+ "learning_rate": 4.9104783123737566e-05,
1492
+ "loss": 2.4084,
1493
+ "step": 212
1494
+ }
1495
+ ],
1496
+ "logging_steps": 1,
1497
+ "max_steps": 312,
1498
+ "num_input_tokens_seen": 0,
1499
+ "num_train_epochs": 6,
1500
+ "save_steps": 500,
1501
+ "stateful_callbacks": {
1502
+ "TrainerControl": {
1503
+ "args": {
1504
+ "should_epoch_stop": false,
1505
+ "should_evaluate": false,
1506
+ "should_log": false,
1507
+ "should_save": true,
1508
+ "should_training_stop": false
1509
+ },
1510
+ "attributes": {}
1511
+ }
1512
+ },
1513
+ "total_flos": 593979339571200.0,
1514
+ "train_batch_size": 4,
1515
+ "trial_name": null,
1516
+ "trial_params": null
1517
+ }
checkpoint-212/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8d8f75328c89f0f8d97ecc3fb21f0a76fa9b188979afd06060c2f286d07806
3
+ size 6456
checkpoint-212/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-265/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-0.5B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-265/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-0.5B",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "o_proj",
27
+ "v_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
checkpoint-265/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:874f5fe896254f14af5961b15081a3200be3ad8b456ae4be498e31a9cd311bd7
3
+ size 8676008
checkpoint-265/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</thinking>": 151666,
3
+ "</tool_call>": 151658,
4
+ "<conspiracy>": 151668,
5
+ "<tangent>": 151667,
6
+ "<thinking>": 151665,
7
+ "<tool_call>": 151657,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-265/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-265/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0d1b5672052c231ac3fa243eb26edb29baf4521d02638f2e687acbdcb271f2b
3
+ size 17415034
checkpoint-265/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
3
+ size 14244
checkpoint-265/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08c1bdbf94a36542da6f9a56f3bfb435897976eab2da89a984e6406fba050177
3
+ size 1064
checkpoint-265/special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<thinking>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</thinking>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<tangent>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<conspiracy>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ ],
32
+ "eos_token": {
33
+ "content": "<|endoftext|>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<|endoftext|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ }
46
+ }
checkpoint-265/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d6915fdc377a021898a989de25f3e54ffcc6c1e9497f10812eb8a4504a7f01
3
+ size 11422646
checkpoint-265/tokenizer_config.json ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<thinking>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "</thinking>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<tangent>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<conspiracy>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<thinking>",
216
+ "</thinking>",
217
+ "<tangent>",
218
+ "<conspiracy>"
219
+ ],
220
+ "bos_token": null,
221
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
222
+ "clean_up_tokenization_spaces": false,
223
+ "eos_token": "<|endoftext|>",
224
+ "errors": "replace",
225
+ "extra_special_tokens": {},
226
+ "model_max_length": 131072,
227
+ "pad_token": "<|endoftext|>",
228
+ "split_special_tokens": false,
229
+ "tokenizer_class": "Qwen2Tokenizer",
230
+ "unk_token": null
231
+ }
checkpoint-265/trainer_state.json ADDED
@@ -0,0 +1,1888 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 265,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.019138755980861243,
13
+ "grad_norm": 3.7146408557891846,
14
+ "learning_rate": 2.2222222222222223e-05,
15
+ "loss": 4.4869,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.03827751196172249,
20
+ "grad_norm": 3.3118133544921875,
21
+ "learning_rate": 4.4444444444444447e-05,
22
+ "loss": 4.1867,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.05741626794258373,
27
+ "grad_norm": 2.972708225250244,
28
+ "learning_rate": 6.666666666666667e-05,
29
+ "loss": 4.001,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.07655502392344497,
34
+ "grad_norm": 4.938202381134033,
35
+ "learning_rate": 8.888888888888889e-05,
36
+ "loss": 5.0582,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.09569377990430622,
41
+ "grad_norm": 3.5732812881469727,
42
+ "learning_rate": 0.00011111111111111112,
43
+ "loss": 4.5871,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.11483253588516747,
48
+ "grad_norm": 3.350315570831299,
49
+ "learning_rate": 0.00013333333333333334,
50
+ "loss": 4.0071,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.1339712918660287,
55
+ "grad_norm": 3.4415643215179443,
56
+ "learning_rate": 0.00015555555555555556,
57
+ "loss": 3.9791,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.15311004784688995,
62
+ "grad_norm": 2.558781385421753,
63
+ "learning_rate": 0.00017777777777777779,
64
+ "loss": 3.6497,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.1722488038277512,
69
+ "grad_norm": 2.3021087646484375,
70
+ "learning_rate": 0.0002,
71
+ "loss": 3.5205,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.19138755980861244,
76
+ "grad_norm": 2.301999568939209,
77
+ "learning_rate": 0.00019999462497359466,
78
+ "loss": 4.112,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.21052631578947367,
83
+ "grad_norm": 3.0552637577056885,
84
+ "learning_rate": 0.0001999785004721968,
85
+ "loss": 3.8723,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.22966507177033493,
90
+ "grad_norm": 2.5972537994384766,
91
+ "learning_rate": 0.00019995162822919883,
92
+ "loss": 3.8135,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.24880382775119617,
97
+ "grad_norm": 2.0281920433044434,
98
+ "learning_rate": 0.00019991401113338104,
99
+ "loss": 3.8702,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.2679425837320574,
104
+ "grad_norm": 1.7147849798202515,
105
+ "learning_rate": 0.00019986565322860115,
106
+ "loss": 3.463,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.28708133971291866,
111
+ "grad_norm": 2.082582473754883,
112
+ "learning_rate": 0.00019980655971335945,
113
+ "loss": 3.3816,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3062200956937799,
118
+ "grad_norm": 2.1299426555633545,
119
+ "learning_rate": 0.00019973673694024,
120
+ "loss": 3.698,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.3253588516746411,
125
+ "grad_norm": 1.8626389503479004,
126
+ "learning_rate": 0.0001996561924152278,
127
+ "loss": 3.3583,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.3444976076555024,
132
+ "grad_norm": 2.452871322631836,
133
+ "learning_rate": 0.0001995649347969019,
134
+ "loss": 3.4957,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.36363636363636365,
139
+ "grad_norm": 2.265108585357666,
140
+ "learning_rate": 0.00019946297389550433,
141
+ "loss": 3.1115,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.3827751196172249,
146
+ "grad_norm": 1.996728777885437,
147
+ "learning_rate": 0.0001993503206718859,
148
+ "loss": 3.4159,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.4019138755980861,
153
+ "grad_norm": 1.913594365119934,
154
+ "learning_rate": 0.00019922698723632767,
155
+ "loss": 3.3288,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.42105263157894735,
160
+ "grad_norm": 2.4316132068634033,
161
+ "learning_rate": 0.00019909298684723904,
162
+ "loss": 3.4245,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.44019138755980863,
167
+ "grad_norm": 1.998693823814392,
168
+ "learning_rate": 0.00019894833390973266,
169
+ "loss": 3.1687,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.45933014354066987,
174
+ "grad_norm": 2.21382737159729,
175
+ "learning_rate": 0.0001987930439740757,
176
+ "loss": 3.4307,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.4784688995215311,
181
+ "grad_norm": 2.586013078689575,
182
+ "learning_rate": 0.0001986271337340182,
183
+ "loss": 3.3596,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.49760765550239233,
188
+ "grad_norm": 2.8244550228118896,
189
+ "learning_rate": 0.0001984506210249986,
190
+ "loss": 3.3107,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.5167464114832536,
195
+ "grad_norm": 2.0228700637817383,
196
+ "learning_rate": 0.00019826352482222638,
197
+ "loss": 3.1749,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.5358851674641149,
202
+ "grad_norm": 2.7035820484161377,
203
+ "learning_rate": 0.0001980658652386421,
204
+ "loss": 3.0995,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.5550239234449761,
209
+ "grad_norm": 2.119741916656494,
210
+ "learning_rate": 0.00019785766352275542,
211
+ "loss": 3.225,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.5741626794258373,
216
+ "grad_norm": 2.5071310997009277,
217
+ "learning_rate": 0.00019763894205636072,
218
+ "loss": 3.0066,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.5933014354066986,
223
+ "grad_norm": 2.992201566696167,
224
+ "learning_rate": 0.00019740972435213115,
225
+ "loss": 3.0412,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.6124401913875598,
230
+ "grad_norm": 2.820875883102417,
231
+ "learning_rate": 0.00019717003505109095,
232
+ "loss": 3.3575,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.631578947368421,
237
+ "grad_norm": 2.7096059322357178,
238
+ "learning_rate": 0.00019691989991996663,
239
+ "loss": 3.3531,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.6507177033492823,
244
+ "grad_norm": 2.172783374786377,
245
+ "learning_rate": 0.00019665934584841682,
246
+ "loss": 3.2852,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.6698564593301436,
251
+ "grad_norm": 3.238025188446045,
252
+ "learning_rate": 0.00019638840084614182,
253
+ "loss": 3.3818,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.6889952153110048,
258
+ "grad_norm": 2.92851185798645,
259
+ "learning_rate": 0.00019610709403987246,
260
+ "loss": 3.1,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.7081339712918661,
265
+ "grad_norm": 2.514800786972046,
266
+ "learning_rate": 0.000195815455670239,
267
+ "loss": 3.0883,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.7272727272727273,
272
+ "grad_norm": 3.264613151550293,
273
+ "learning_rate": 0.0001955135170885202,
274
+ "loss": 2.9932,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.7464114832535885,
279
+ "grad_norm": 2.4111247062683105,
280
+ "learning_rate": 0.00019520131075327298,
281
+ "loss": 2.974,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.7655502392344498,
286
+ "grad_norm": 2.692473888397217,
287
+ "learning_rate": 0.00019487887022684336,
288
+ "loss": 2.8822,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.784688995215311,
293
+ "grad_norm": 3.3863365650177,
294
+ "learning_rate": 0.00019454623017175812,
295
+ "loss": 3.0255,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.8038277511961722,
300
+ "grad_norm": 2.2267720699310303,
301
+ "learning_rate": 0.0001942034263469989,
302
+ "loss": 3.1883,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.8229665071770335,
307
+ "grad_norm": 2.31858491897583,
308
+ "learning_rate": 0.00019385049560415794,
309
+ "loss": 2.9882,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.8421052631578947,
314
+ "grad_norm": 2.3098323345184326,
315
+ "learning_rate": 0.00019348747588347637,
316
+ "loss": 2.8801,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.861244019138756,
321
+ "grad_norm": 3.3286585807800293,
322
+ "learning_rate": 0.00019311440620976597,
323
+ "loss": 2.9762,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.8803827751196173,
328
+ "grad_norm": 3.1082146167755127,
329
+ "learning_rate": 0.00019273132668821364,
330
+ "loss": 2.8514,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.8995215311004785,
335
+ "grad_norm": 2.2908411026000977,
336
+ "learning_rate": 0.00019233827850007027,
337
+ "loss": 3.2993,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.9186602870813397,
342
+ "grad_norm": 2.1068387031555176,
343
+ "learning_rate": 0.00019193530389822363,
344
+ "loss": 3.0606,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.937799043062201,
349
+ "grad_norm": 2.951885938644409,
350
+ "learning_rate": 0.0001915224462026563,
351
+ "loss": 3.042,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.9569377990430622,
356
+ "grad_norm": 2.2476351261138916,
357
+ "learning_rate": 0.0001910997497957885,
358
+ "loss": 2.9928,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.9760765550239234,
363
+ "grad_norm": 1.9801242351531982,
364
+ "learning_rate": 0.00019066726011770726,
365
+ "loss": 2.8911,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.9952153110047847,
370
+ "grad_norm": 2.5246548652648926,
371
+ "learning_rate": 0.00019022502366128135,
372
+ "loss": 3.2457,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 1.0,
377
+ "grad_norm": 5.682666778564453,
378
+ "learning_rate": 0.0001897730879671634,
379
+ "loss": 2.4435,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 1.0191387559808613,
384
+ "grad_norm": 2.66831374168396,
385
+ "learning_rate": 0.00018931150161867916,
386
+ "loss": 2.7807,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 1.0382775119617225,
391
+ "grad_norm": 2.5246026515960693,
392
+ "learning_rate": 0.0001888403142366049,
393
+ "loss": 2.7599,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 1.0574162679425838,
398
+ "grad_norm": 1.959625244140625,
399
+ "learning_rate": 0.00018835957647383303,
400
+ "loss": 2.9087,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 1.076555023923445,
405
+ "grad_norm": 2.277261257171631,
406
+ "learning_rate": 0.00018786934000992688,
407
+ "loss": 3.2283,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 1.0956937799043063,
412
+ "grad_norm": 3.0258898735046387,
413
+ "learning_rate": 0.00018736965754556528,
414
+ "loss": 3.0084,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 1.1148325358851674,
419
+ "grad_norm": 2.4277517795562744,
420
+ "learning_rate": 0.00018686058279687698,
421
+ "loss": 2.7159,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 1.1339712918660287,
426
+ "grad_norm": 3.0732321739196777,
427
+ "learning_rate": 0.00018634217048966637,
428
+ "loss": 2.9704,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 1.1531100478468899,
433
+ "grad_norm": 3.0256996154785156,
434
+ "learning_rate": 0.0001858144763535302,
435
+ "loss": 3.0254,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 1.1722488038277512,
440
+ "grad_norm": 2.7575695514678955,
441
+ "learning_rate": 0.00018527755711586678,
442
+ "loss": 2.4907,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 1.1913875598086126,
447
+ "grad_norm": 2.813037157058716,
448
+ "learning_rate": 0.00018473147049577774,
449
+ "loss": 2.8598,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 1.2105263157894737,
454
+ "grad_norm": 2.197244644165039,
455
+ "learning_rate": 0.00018417627519786315,
456
+ "loss": 2.965,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 1.229665071770335,
461
+ "grad_norm": 2.0711350440979004,
462
+ "learning_rate": 0.00018361203090591071,
463
+ "loss": 2.9582,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 1.2488038277511961,
468
+ "grad_norm": 2.7295780181884766,
469
+ "learning_rate": 0.00018303879827647975,
470
+ "loss": 2.5148,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 1.2679425837320575,
475
+ "grad_norm": 2.511603593826294,
476
+ "learning_rate": 0.00018245663893238075,
477
+ "loss": 2.7535,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 1.2870813397129186,
482
+ "grad_norm": 3.695086717605591,
483
+ "learning_rate": 0.00018186561545605054,
484
+ "loss": 2.6003,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 1.30622009569378,
489
+ "grad_norm": 3.2395761013031006,
490
+ "learning_rate": 0.00018126579138282503,
491
+ "loss": 2.7334,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 1.325358851674641,
496
+ "grad_norm": 3.004142999649048,
497
+ "learning_rate": 0.00018065723119410884,
498
+ "loss": 2.7788,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 1.3444976076555024,
503
+ "grad_norm": 2.964301824569702,
504
+ "learning_rate": 0.0001800400003104436,
505
+ "loss": 2.734,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 1.3636363636363638,
510
+ "grad_norm": 3.981093645095825,
511
+ "learning_rate": 0.00017941416508447536,
512
+ "loss": 2.7476,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 1.3827751196172249,
517
+ "grad_norm": 3.2536420822143555,
518
+ "learning_rate": 0.00017877979279382135,
519
+ "loss": 2.5386,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 1.401913875598086,
524
+ "grad_norm": 3.6163337230682373,
525
+ "learning_rate": 0.0001781369516338378,
526
+ "loss": 2.6767,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 1.4210526315789473,
531
+ "grad_norm": 3.6883926391601562,
532
+ "learning_rate": 0.000177485710710289,
533
+ "loss": 2.5656,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 1.4401913875598087,
538
+ "grad_norm": 3.5389018058776855,
539
+ "learning_rate": 0.00017682614003191807,
540
+ "loss": 2.6626,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 1.4593301435406698,
545
+ "grad_norm": 2.324506998062134,
546
+ "learning_rate": 0.0001761583105029213,
547
+ "loss": 2.6479,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 1.4784688995215312,
552
+ "grad_norm": 2.271515130996704,
553
+ "learning_rate": 0.00017548229391532572,
554
+ "loss": 2.874,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 1.4976076555023923,
559
+ "grad_norm": 3.023533821105957,
560
+ "learning_rate": 0.00017479816294127152,
561
+ "loss": 2.4017,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 1.5167464114832536,
566
+ "grad_norm": 4.101243495941162,
567
+ "learning_rate": 0.0001741059911251997,
568
+ "loss": 3.0185,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 1.535885167464115,
573
+ "grad_norm": 3.056877374649048,
574
+ "learning_rate": 0.00017340585287594604,
575
+ "loss": 2.7875,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 1.555023923444976,
580
+ "grad_norm": 3.0255823135375977,
581
+ "learning_rate": 0.00017269782345874203,
582
+ "loss": 2.8453,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 1.5741626794258372,
587
+ "grad_norm": 3.57423734664917,
588
+ "learning_rate": 0.00017198197898712404,
589
+ "loss": 2.6948,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 1.5933014354066986,
594
+ "grad_norm": 3.436167001724243,
595
+ "learning_rate": 0.00017125839641475072,
596
+ "loss": 2.6287,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 1.61244019138756,
601
+ "grad_norm": 3.1058871746063232,
602
+ "learning_rate": 0.00017052715352713075,
603
+ "loss": 2.5887,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 1.631578947368421,
608
+ "grad_norm": 2.1073200702667236,
609
+ "learning_rate": 0.00016978832893326074,
610
+ "loss": 2.9573,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 1.6507177033492821,
615
+ "grad_norm": 2.8039920330047607,
616
+ "learning_rate": 0.0001690420020571747,
617
+ "loss": 2.9652,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 1.6698564593301435,
622
+ "grad_norm": 2.8494677543640137,
623
+ "learning_rate": 0.00016828825312940592,
624
+ "loss": 2.6263,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 1.6889952153110048,
629
+ "grad_norm": 2.3521246910095215,
630
+ "learning_rate": 0.00016752716317836229,
631
+ "loss": 2.683,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 1.7081339712918662,
636
+ "grad_norm": 2.5750181674957275,
637
+ "learning_rate": 0.00016675881402161536,
638
+ "loss": 2.611,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 1.7272727272727273,
643
+ "grad_norm": 2.687619924545288,
644
+ "learning_rate": 0.00016598328825710533,
645
+ "loss": 2.523,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 1.7464114832535884,
650
+ "grad_norm": 3.112954616546631,
651
+ "learning_rate": 0.00016520066925426144,
652
+ "loss": 2.6558,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 1.7655502392344498,
657
+ "grad_norm": 3.4932713508605957,
658
+ "learning_rate": 0.0001644110411450398,
659
+ "loss": 2.6962,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 1.784688995215311,
664
+ "grad_norm": 2.564894437789917,
665
+ "learning_rate": 0.00016361448881487914,
666
+ "loss": 2.8202,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 1.8038277511961722,
671
+ "grad_norm": 3.1496503353118896,
672
+ "learning_rate": 0.0001628110978935756,
673
+ "loss": 2.2734,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 1.8229665071770333,
678
+ "grad_norm": 2.6274123191833496,
679
+ "learning_rate": 0.00016200095474607753,
680
+ "loss": 2.5264,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 1.8421052631578947,
685
+ "grad_norm": 2.374180555343628,
686
+ "learning_rate": 0.0001611841464632011,
687
+ "loss": 2.8034,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 1.861244019138756,
692
+ "grad_norm": 2.691254138946533,
693
+ "learning_rate": 0.00016036076085226814,
694
+ "loss": 2.5935,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 1.8803827751196174,
699
+ "grad_norm": 2.9795515537261963,
700
+ "learning_rate": 0.0001595308864276666,
701
+ "loss": 2.8707,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 1.8995215311004785,
706
+ "grad_norm": 3.1781864166259766,
707
+ "learning_rate": 0.0001586946124013354,
708
+ "loss": 2.458,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 1.9186602870813396,
713
+ "grad_norm": 2.8759453296661377,
714
+ "learning_rate": 0.00015785202867317407,
715
+ "loss": 2.5201,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 1.937799043062201,
720
+ "grad_norm": 3.2317118644714355,
721
+ "learning_rate": 0.00015700322582137827,
722
+ "loss": 2.5585,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 1.9569377990430623,
727
+ "grad_norm": 3.463688373565674,
728
+ "learning_rate": 0.0001561482950927029,
729
+ "loss": 2.4652,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 1.9760765550239234,
734
+ "grad_norm": 2.4766316413879395,
735
+ "learning_rate": 0.00015528732839265272,
736
+ "loss": 2.5966,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 1.9952153110047846,
741
+ "grad_norm": 2.8042709827423096,
742
+ "learning_rate": 0.00015442041827560274,
743
+ "loss": 2.5278,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 2.0,
748
+ "grad_norm": 8.298028945922852,
749
+ "learning_rate": 0.00015354765793484834,
750
+ "loss": 2.8732,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 2.0191387559808613,
755
+ "grad_norm": 3.808393716812134,
756
+ "learning_rate": 0.000152669141192587,
757
+ "loss": 2.1442,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 2.0382775119617227,
762
+ "grad_norm": 3.3381223678588867,
763
+ "learning_rate": 0.00015178496248983254,
764
+ "loss": 2.6125,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 2.0574162679425836,
769
+ "grad_norm": 4.778241157531738,
770
+ "learning_rate": 0.00015089521687626243,
771
+ "loss": 2.399,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 2.076555023923445,
776
+ "grad_norm": 2.613919973373413,
777
+ "learning_rate": 0.00015000000000000001,
778
+ "loss": 2.5189,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 2.0956937799043063,
783
+ "grad_norm": 3.6656932830810547,
784
+ "learning_rate": 0.00014909940809733222,
785
+ "loss": 2.2785,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 2.1148325358851676,
790
+ "grad_norm": 2.968078136444092,
791
+ "learning_rate": 0.00014819353798236427,
792
+ "loss": 2.3605,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 2.1339712918660285,
797
+ "grad_norm": 2.7252070903778076,
798
+ "learning_rate": 0.00014728248703661182,
799
+ "loss": 2.173,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 2.15311004784689,
804
+ "grad_norm": 3.9389491081237793,
805
+ "learning_rate": 0.00014636635319853275,
806
+ "loss": 2.457,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 2.172248803827751,
811
+ "grad_norm": 3.658862590789795,
812
+ "learning_rate": 0.00014544523495299842,
813
+ "loss": 2.6971,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 2.1913875598086126,
818
+ "grad_norm": 3.303403377532959,
819
+ "learning_rate": 0.0001445192313207067,
820
+ "loss": 2.7851,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 2.2105263157894735,
825
+ "grad_norm": 3.910428047180176,
826
+ "learning_rate": 0.00014358844184753712,
827
+ "loss": 2.1422,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 2.229665071770335,
832
+ "grad_norm": 3.3043367862701416,
833
+ "learning_rate": 0.00014265296659384956,
834
+ "loss": 2.5404,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 2.248803827751196,
839
+ "grad_norm": 2.9098987579345703,
840
+ "learning_rate": 0.0001417129061237278,
841
+ "loss": 2.567,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 2.2679425837320575,
846
+ "grad_norm": 4.142232894897461,
847
+ "learning_rate": 0.00014076836149416887,
848
+ "loss": 2.4179,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 2.287081339712919,
853
+ "grad_norm": 2.110104560852051,
854
+ "learning_rate": 0.00013981943424421932,
855
+ "loss": 2.4976,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 2.3062200956937797,
860
+ "grad_norm": 2.6828229427337646,
861
+ "learning_rate": 0.00013886622638405952,
862
+ "loss": 2.5762,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 2.325358851674641,
867
+ "grad_norm": 3.0066471099853516,
868
+ "learning_rate": 0.00013790884038403795,
869
+ "loss": 2.2882,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 2.3444976076555024,
874
+ "grad_norm": 3.791444778442383,
875
+ "learning_rate": 0.00013694737916365517,
876
+ "loss": 2.1788,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 2.3636363636363638,
881
+ "grad_norm": 2.78275203704834,
882
+ "learning_rate": 0.0001359819460805001,
883
+ "loss": 2.6037,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 2.382775119617225,
888
+ "grad_norm": 4.18953275680542,
889
+ "learning_rate": 0.00013501264491913906,
890
+ "loss": 2.3284,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 2.401913875598086,
895
+ "grad_norm": 2.925140142440796,
896
+ "learning_rate": 0.00013403957987995882,
897
+ "loss": 2.4364,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 2.4210526315789473,
902
+ "grad_norm": 4.545037746429443,
903
+ "learning_rate": 0.00013306285556796495,
904
+ "loss": 2.4096,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 2.4401913875598087,
909
+ "grad_norm": 3.785428524017334,
910
+ "learning_rate": 0.00013208257698153677,
911
+ "loss": 2.1047,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 2.45933014354067,
916
+ "grad_norm": 3.6228346824645996,
917
+ "learning_rate": 0.00013109884950114007,
918
+ "loss": 2.5744,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 2.478468899521531,
923
+ "grad_norm": 2.9221742153167725,
924
+ "learning_rate": 0.00013011177887799845,
925
+ "loss": 2.5266,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 2.4976076555023923,
930
+ "grad_norm": 3.659484386444092,
931
+ "learning_rate": 0.00012912147122272523,
932
+ "loss": 2.3707,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 2.5167464114832536,
937
+ "grad_norm": 3.5442514419555664,
938
+ "learning_rate": 0.00012812803299391628,
939
+ "loss": 2.4695,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 2.535885167464115,
944
+ "grad_norm": 3.1291420459747314,
945
+ "learning_rate": 0.0001271315709867059,
946
+ "loss": 2.687,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 2.555023923444976,
951
+ "grad_norm": 4.138225078582764,
952
+ "learning_rate": 0.00012613219232128608,
953
+ "loss": 2.2378,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 2.574162679425837,
958
+ "grad_norm": 2.8483548164367676,
959
+ "learning_rate": 0.00012513000443139112,
960
+ "loss": 2.4044,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 2.5933014354066986,
965
+ "grad_norm": 2.434741497039795,
966
+ "learning_rate": 0.00012412511505274844,
967
+ "loss": 2.5664,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 2.61244019138756,
972
+ "grad_norm": 3.9319725036621094,
973
+ "learning_rate": 0.000123117632211497,
974
+ "loss": 2.2586,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 2.6315789473684212,
979
+ "grad_norm": 3.4802486896514893,
980
+ "learning_rate": 0.0001221076642125742,
981
+ "loss": 2.0743,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 2.650717703349282,
986
+ "grad_norm": 3.1535286903381348,
987
+ "learning_rate": 0.00012109531962807332,
988
+ "loss": 2.302,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 2.6698564593301435,
993
+ "grad_norm": 2.9818458557128906,
994
+ "learning_rate": 0.00012008070728557186,
995
+ "loss": 2.4418,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 2.688995215311005,
1000
+ "grad_norm": 4.8768630027771,
1001
+ "learning_rate": 0.00011906393625643244,
1002
+ "loss": 2.5083,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 2.708133971291866,
1007
+ "grad_norm": 3.8520619869232178,
1008
+ "learning_rate": 0.00011804511584407763,
1009
+ "loss": 1.9994,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 2.7272727272727275,
1014
+ "grad_norm": 3.784248113632202,
1015
+ "learning_rate": 0.00011702435557223987,
1016
+ "loss": 2.2376,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 2.7464114832535884,
1021
+ "grad_norm": 4.1650800704956055,
1022
+ "learning_rate": 0.00011600176517318741,
1023
+ "loss": 2.3886,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 2.7655502392344498,
1028
+ "grad_norm": 4.099468231201172,
1029
+ "learning_rate": 0.00011497745457592816,
1030
+ "loss": 2.5978,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 2.784688995215311,
1035
+ "grad_norm": 4.268674850463867,
1036
+ "learning_rate": 0.00011395153389439233,
1037
+ "loss": 2.4882,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 2.803827751196172,
1042
+ "grad_norm": 4.081464767456055,
1043
+ "learning_rate": 0.0001129241134155949,
1044
+ "loss": 2.6547,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 2.8229665071770333,
1049
+ "grad_norm": 3.1537716388702393,
1050
+ "learning_rate": 0.00011189530358778005,
1051
+ "loss": 2.5361,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 2.8421052631578947,
1056
+ "grad_norm": 4.182295322418213,
1057
+ "learning_rate": 0.00011086521500854745,
1058
+ "loss": 2.385,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 2.861244019138756,
1063
+ "grad_norm": 2.5511474609375,
1064
+ "learning_rate": 0.00010983395841296348,
1065
+ "loss": 2.4617,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 2.8803827751196174,
1070
+ "grad_norm": 3.1007962226867676,
1071
+ "learning_rate": 0.00010880164466165674,
1072
+ "loss": 2.5788,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 2.8995215311004783,
1077
+ "grad_norm": 4.509490966796875,
1078
+ "learning_rate": 0.00010776838472890065,
1079
+ "loss": 2.1361,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 2.9186602870813396,
1084
+ "grad_norm": 2.6765851974487305,
1085
+ "learning_rate": 0.00010673428969068364,
1086
+ "loss": 2.3922,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 2.937799043062201,
1091
+ "grad_norm": 3.704310894012451,
1092
+ "learning_rate": 0.00010569947071276847,
1093
+ "loss": 2.6924,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 2.9569377990430623,
1098
+ "grad_norm": 3.935804843902588,
1099
+ "learning_rate": 0.00010466403903874176,
1100
+ "loss": 2.2886,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 2.9760765550239237,
1105
+ "grad_norm": 4.105613708496094,
1106
+ "learning_rate": 0.00010362810597805526,
1107
+ "loss": 2.3865,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 2.9952153110047846,
1112
+ "grad_norm": 3.669766664505005,
1113
+ "learning_rate": 0.00010259178289406011,
1114
+ "loss": 2.2158,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 3.0,
1119
+ "grad_norm": 8.930411338806152,
1120
+ "learning_rate": 0.0001015551811920351,
1121
+ "loss": 2.5214,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 3.0191387559808613,
1126
+ "grad_norm": 3.3217484951019287,
1127
+ "learning_rate": 0.00010051841230721065,
1128
+ "loss": 2.5409,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 3.0382775119617227,
1133
+ "grad_norm": 3.8041253089904785,
1134
+ "learning_rate": 9.948158769278939e-05,
1135
+ "loss": 1.8309,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 3.0574162679425836,
1140
+ "grad_norm": 3.892636775970459,
1141
+ "learning_rate": 9.844481880796491e-05,
1142
+ "loss": 2.0955,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 3.076555023923445,
1147
+ "grad_norm": 3.4822261333465576,
1148
+ "learning_rate": 9.740821710593989e-05,
1149
+ "loss": 2.2865,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 3.0956937799043063,
1154
+ "grad_norm": 3.033822774887085,
1155
+ "learning_rate": 9.637189402194476e-05,
1156
+ "loss": 2.3693,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 3.1148325358851676,
1161
+ "grad_norm": 3.693204641342163,
1162
+ "learning_rate": 9.533596096125825e-05,
1163
+ "loss": 1.984,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 3.1339712918660285,
1168
+ "grad_norm": 3.3877508640289307,
1169
+ "learning_rate": 9.430052928723153e-05,
1170
+ "loss": 2.0891,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 3.15311004784689,
1175
+ "grad_norm": 4.376189708709717,
1176
+ "learning_rate": 9.326571030931637e-05,
1177
+ "loss": 2.1974,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 3.172248803827751,
1182
+ "grad_norm": 3.557032823562622,
1183
+ "learning_rate": 9.223161527109937e-05,
1184
+ "loss": 1.9757,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 3.1913875598086126,
1189
+ "grad_norm": 2.733353853225708,
1190
+ "learning_rate": 9.119835533834331e-05,
1191
+ "loss": 2.4171,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 3.2105263157894735,
1196
+ "grad_norm": 2.7016165256500244,
1197
+ "learning_rate": 9.016604158703654e-05,
1198
+ "loss": 2.2992,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 3.229665071770335,
1203
+ "grad_norm": 3.997654438018799,
1204
+ "learning_rate": 8.913478499145254e-05,
1205
+ "loss": 2.1117,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 3.248803827751196,
1210
+ "grad_norm": 4.044878005981445,
1211
+ "learning_rate": 8.810469641222001e-05,
1212
+ "loss": 2.245,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 3.2679425837320575,
1217
+ "grad_norm": 3.080991506576538,
1218
+ "learning_rate": 8.707588658440511e-05,
1219
+ "loss": 2.1612,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 3.287081339712919,
1224
+ "grad_norm": 3.295807123184204,
1225
+ "learning_rate": 8.604846610560771e-05,
1226
+ "loss": 2.3217,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 3.3062200956937797,
1231
+ "grad_norm": 3.5904176235198975,
1232
+ "learning_rate": 8.502254542407186e-05,
1233
+ "loss": 2.3138,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 3.325358851674641,
1238
+ "grad_norm": 4.395754814147949,
1239
+ "learning_rate": 8.399823482681262e-05,
1240
+ "loss": 1.9074,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 3.3444976076555024,
1245
+ "grad_norm": 3.2221572399139404,
1246
+ "learning_rate": 8.297564442776014e-05,
1247
+ "loss": 2.2977,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 3.3636363636363638,
1252
+ "grad_norm": 2.9927215576171875,
1253
+ "learning_rate": 8.195488415592238e-05,
1254
+ "loss": 2.3785,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 3.382775119617225,
1259
+ "grad_norm": 3.9036011695861816,
1260
+ "learning_rate": 8.093606374356759e-05,
1261
+ "loss": 1.9962,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 3.401913875598086,
1266
+ "grad_norm": 4.485937595367432,
1267
+ "learning_rate": 7.991929271442817e-05,
1268
+ "loss": 1.7251,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 3.4210526315789473,
1273
+ "grad_norm": 4.750828742980957,
1274
+ "learning_rate": 7.89046803719267e-05,
1275
+ "loss": 2.1263,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 3.4401913875598087,
1280
+ "grad_norm": 4.138678550720215,
1281
+ "learning_rate": 7.789233578742582e-05,
1282
+ "loss": 2.0091,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 3.45933014354067,
1287
+ "grad_norm": 3.6726274490356445,
1288
+ "learning_rate": 7.688236778850306e-05,
1289
+ "loss": 2.3806,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 3.478468899521531,
1294
+ "grad_norm": 4.481295108795166,
1295
+ "learning_rate": 7.587488494725157e-05,
1296
+ "loss": 2.1338,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 3.4976076555023923,
1301
+ "grad_norm": 3.9401016235351562,
1302
+ "learning_rate": 7.48699955686089e-05,
1303
+ "loss": 2.1403,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 3.5167464114832536,
1308
+ "grad_norm": 4.227544784545898,
1309
+ "learning_rate": 7.386780767871397e-05,
1310
+ "loss": 2.3207,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 3.535885167464115,
1315
+ "grad_norm": 3.4885573387145996,
1316
+ "learning_rate": 7.286842901329412e-05,
1317
+ "loss": 2.2671,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 3.555023923444976,
1322
+ "grad_norm": 4.438218593597412,
1323
+ "learning_rate": 7.187196700608373e-05,
1324
+ "loss": 2.0748,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 3.574162679425837,
1329
+ "grad_norm": 3.766284465789795,
1330
+ "learning_rate": 7.087852877727481e-05,
1331
+ "loss": 2.5101,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 3.5933014354066986,
1336
+ "grad_norm": 4.027716636657715,
1337
+ "learning_rate": 6.988822112200156e-05,
1338
+ "loss": 2.3361,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 3.61244019138756,
1343
+ "grad_norm": 4.409999370574951,
1344
+ "learning_rate": 6.890115049885994e-05,
1345
+ "loss": 2.2492,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 3.6315789473684212,
1350
+ "grad_norm": 3.596459150314331,
1351
+ "learning_rate": 6.791742301846326e-05,
1352
+ "loss": 2.2855,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 3.650717703349282,
1357
+ "grad_norm": 4.667017459869385,
1358
+ "learning_rate": 6.693714443203507e-05,
1359
+ "loss": 2.083,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 3.6698564593301435,
1364
+ "grad_norm": 4.831173896789551,
1365
+ "learning_rate": 6.59604201200412e-05,
1366
+ "loss": 2.1568,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 3.688995215311005,
1371
+ "grad_norm": 3.5013201236724854,
1372
+ "learning_rate": 6.498735508086093e-05,
1373
+ "loss": 2.108,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 3.708133971291866,
1378
+ "grad_norm": 4.176932334899902,
1379
+ "learning_rate": 6.40180539194999e-05,
1380
+ "loss": 1.8315,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 3.7272727272727275,
1385
+ "grad_norm": 5.187565803527832,
1386
+ "learning_rate": 6.305262083634488e-05,
1387
+ "loss": 2.3541,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 3.7464114832535884,
1392
+ "grad_norm": 4.090083599090576,
1393
+ "learning_rate": 6.209115961596208e-05,
1394
+ "loss": 2.0691,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 3.7655502392344498,
1399
+ "grad_norm": 3.806030750274658,
1400
+ "learning_rate": 6.113377361594049e-05,
1401
+ "loss": 2.0471,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 3.784688995215311,
1406
+ "grad_norm": 4.668728828430176,
1407
+ "learning_rate": 6.018056575578075e-05,
1408
+ "loss": 2.335,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 3.803827751196172,
1413
+ "grad_norm": 4.811546325683594,
1414
+ "learning_rate": 5.923163850583113e-05,
1415
+ "loss": 2.2242,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 3.8229665071770333,
1420
+ "grad_norm": 5.359763145446777,
1421
+ "learning_rate": 5.828709387627218e-05,
1422
+ "loss": 2.3298,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 3.8421052631578947,
1427
+ "grad_norm": 3.5501046180725098,
1428
+ "learning_rate": 5.73470334061505e-05,
1429
+ "loss": 2.1297,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 3.861244019138756,
1434
+ "grad_norm": 3.4878952503204346,
1435
+ "learning_rate": 5.6411558152462894e-05,
1436
+ "loss": 2.3615,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 3.8803827751196174,
1441
+ "grad_norm": 4.381737232208252,
1442
+ "learning_rate": 5.54807686792933e-05,
1443
+ "loss": 2.0084,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 3.8995215311004783,
1448
+ "grad_norm": 5.2298359870910645,
1449
+ "learning_rate": 5.4554765047001613e-05,
1450
+ "loss": 2.362,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 3.9186602870813396,
1455
+ "grad_norm": 3.3613922595977783,
1456
+ "learning_rate": 5.363364680146725e-05,
1457
+ "loss": 2.1292,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 3.937799043062201,
1462
+ "grad_norm": 4.079115867614746,
1463
+ "learning_rate": 5.271751296338823e-05,
1464
+ "loss": 2.3561,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 3.9569377990430623,
1469
+ "grad_norm": 4.030163764953613,
1470
+ "learning_rate": 5.180646201763577e-05,
1471
+ "loss": 2.1954,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 3.9760765550239237,
1476
+ "grad_norm": 4.383935928344727,
1477
+ "learning_rate": 5.090059190266779e-05,
1478
+ "loss": 2.0793,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 3.9952153110047846,
1483
+ "grad_norm": 5.1565775871276855,
1484
+ "learning_rate": 5.000000000000002e-05,
1485
+ "loss": 2.0134,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 4.0,
1490
+ "grad_norm": 8.855152130126953,
1491
+ "learning_rate": 4.9104783123737566e-05,
1492
+ "loss": 2.4084,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 4.019138755980861,
1497
+ "grad_norm": 3.997187614440918,
1498
+ "learning_rate": 4.821503751016746e-05,
1499
+ "loss": 1.978,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 4.038277511961723,
1504
+ "grad_norm": 5.068262100219727,
1505
+ "learning_rate": 4.733085880741301e-05,
1506
+ "loss": 2.199,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 4.057416267942584,
1511
+ "grad_norm": 3.602715015411377,
1512
+ "learning_rate": 4.645234206515171e-05,
1513
+ "loss": 2.0417,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 4.076555023923445,
1518
+ "grad_norm": 4.461487293243408,
1519
+ "learning_rate": 4.5579581724397255e-05,
1520
+ "loss": 1.7777,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 4.095693779904306,
1525
+ "grad_norm": 3.1041159629821777,
1526
+ "learning_rate": 4.471267160734731e-05,
1527
+ "loss": 2.1061,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 4.114832535885167,
1532
+ "grad_norm": 3.8727328777313232,
1533
+ "learning_rate": 4.385170490729712e-05,
1534
+ "loss": 2.4006,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 4.133971291866029,
1539
+ "grad_norm": 3.868997097015381,
1540
+ "learning_rate": 4.2996774178621736e-05,
1541
+ "loss": 2.0803,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 4.15311004784689,
1546
+ "grad_norm": 3.5627689361572266,
1547
+ "learning_rate": 4.2147971326825966e-05,
1548
+ "loss": 2.0853,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 4.172248803827751,
1553
+ "grad_norm": 4.383954048156738,
1554
+ "learning_rate": 4.130538759866457e-05,
1555
+ "loss": 1.7176,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 4.1913875598086126,
1560
+ "grad_norm": 4.526845932006836,
1561
+ "learning_rate": 4.046911357233343e-05,
1562
+ "loss": 2.0527,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 4.2105263157894735,
1567
+ "grad_norm": 2.923349380493164,
1568
+ "learning_rate": 3.963923914773187e-05,
1569
+ "loss": 2.1541,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 4.229665071770335,
1574
+ "grad_norm": 4.575229167938232,
1575
+ "learning_rate": 3.8815853536798904e-05,
1576
+ "loss": 1.986,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 4.248803827751196,
1581
+ "grad_norm": 3.529787540435791,
1582
+ "learning_rate": 3.79990452539225e-05,
1583
+ "loss": 2.0539,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 4.267942583732057,
1588
+ "grad_norm": 4.581504821777344,
1589
+ "learning_rate": 3.7188902106424416e-05,
1590
+ "loss": 1.9526,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 4.287081339712919,
1595
+ "grad_norm": 3.2781484127044678,
1596
+ "learning_rate": 3.638551118512089e-05,
1597
+ "loss": 2.1485,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 4.30622009569378,
1602
+ "grad_norm": 3.7174124717712402,
1603
+ "learning_rate": 3.558895885496023e-05,
1604
+ "loss": 2.3293,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 4.3253588516746415,
1609
+ "grad_norm": 4.923449993133545,
1610
+ "learning_rate": 3.479933074573858e-05,
1611
+ "loss": 2.0144,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 4.344497607655502,
1616
+ "grad_norm": 4.567214488983154,
1617
+ "learning_rate": 3.401671174289469e-05,
1618
+ "loss": 1.8614,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 4.363636363636363,
1623
+ "grad_norm": 3.7625460624694824,
1624
+ "learning_rate": 3.324118597838464e-05,
1625
+ "loss": 2.1933,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 4.382775119617225,
1630
+ "grad_norm": 5.30003023147583,
1631
+ "learning_rate": 3.2472836821637744e-05,
1632
+ "loss": 2.0038,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 4.401913875598086,
1637
+ "grad_norm": 4.20980167388916,
1638
+ "learning_rate": 3.1711746870594086e-05,
1639
+ "loss": 1.9264,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 4.421052631578947,
1644
+ "grad_norm": 4.678532600402832,
1645
+ "learning_rate": 3.0957997942825336e-05,
1646
+ "loss": 1.9475,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 4.440191387559809,
1651
+ "grad_norm": 4.418569564819336,
1652
+ "learning_rate": 3.021167106673928e-05,
1653
+ "loss": 2.062,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 4.45933014354067,
1658
+ "grad_norm": 4.576781272888184,
1659
+ "learning_rate": 2.9472846472869298e-05,
1660
+ "loss": 2.2673,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 4.478468899521531,
1665
+ "grad_norm": 5.059473037719727,
1666
+ "learning_rate": 2.874160358524931e-05,
1667
+ "loss": 2.2399,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 4.497607655502392,
1672
+ "grad_norm": 5.032463073730469,
1673
+ "learning_rate": 2.8018021012875994e-05,
1674
+ "loss": 1.8512,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 4.516746411483254,
1679
+ "grad_norm": 4.410358428955078,
1680
+ "learning_rate": 2.7302176541257986e-05,
1681
+ "loss": 1.8909,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 4.535885167464115,
1686
+ "grad_norm": 4.2732319831848145,
1687
+ "learning_rate": 2.659414712405398e-05,
1688
+ "loss": 1.833,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 4.555023923444976,
1693
+ "grad_norm": 4.440384387969971,
1694
+ "learning_rate": 2.5894008874800325e-05,
1695
+ "loss": 1.8964,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 4.574162679425838,
1700
+ "grad_norm": 4.8430891036987305,
1701
+ "learning_rate": 2.5201837058728505e-05,
1702
+ "loss": 1.7943,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 4.5933014354066986,
1707
+ "grad_norm": 3.676851987838745,
1708
+ "learning_rate": 2.451770608467432e-05,
1709
+ "loss": 2.0328,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 4.6124401913875595,
1714
+ "grad_norm": 4.80816650390625,
1715
+ "learning_rate": 2.3841689497078746e-05,
1716
+ "loss": 2.1791,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 4.631578947368421,
1721
+ "grad_norm": 4.105157852172852,
1722
+ "learning_rate": 2.3173859968081944e-05,
1723
+ "loss": 2.2402,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 4.650717703349282,
1728
+ "grad_norm": 5.055697441101074,
1729
+ "learning_rate": 2.251428928971102e-05,
1730
+ "loss": 2.2174,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 4.669856459330144,
1735
+ "grad_norm": 5.220304012298584,
1736
+ "learning_rate": 2.1863048366162208e-05,
1737
+ "loss": 2.163,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 4.688995215311005,
1742
+ "grad_norm": 5.349198818206787,
1743
+ "learning_rate": 2.1220207206178688e-05,
1744
+ "loss": 1.8591,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 4.708133971291866,
1749
+ "grad_norm": 3.800992012023926,
1750
+ "learning_rate": 2.058583491552465e-05,
1751
+ "loss": 2.1511,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 4.7272727272727275,
1756
+ "grad_norm": 4.178462982177734,
1757
+ "learning_rate": 1.995999968955641e-05,
1758
+ "loss": 2.2553,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 4.746411483253588,
1763
+ "grad_norm": 5.495607852935791,
1764
+ "learning_rate": 1.9342768805891178e-05,
1765
+ "loss": 2.022,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 4.76555023923445,
1770
+ "grad_norm": 4.614135265350342,
1771
+ "learning_rate": 1.8734208617174988e-05,
1772
+ "loss": 2.1751,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 4.784688995215311,
1777
+ "grad_norm": 3.8945748805999756,
1778
+ "learning_rate": 1.8134384543949478e-05,
1779
+ "loss": 2.0986,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 4.803827751196172,
1784
+ "grad_norm": 5.491265773773193,
1785
+ "learning_rate": 1.754336106761927e-05,
1786
+ "loss": 1.8482,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 4.822966507177034,
1791
+ "grad_norm": 5.249953269958496,
1792
+ "learning_rate": 1.696120172352025e-05,
1793
+ "loss": 1.8416,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 4.842105263157895,
1798
+ "grad_norm": 4.254781246185303,
1799
+ "learning_rate": 1.6387969094089316e-05,
1800
+ "loss": 2.0863,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 4.861244019138756,
1805
+ "grad_norm": 5.3179779052734375,
1806
+ "learning_rate": 1.5823724802136865e-05,
1807
+ "loss": 2.2049,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 4.880382775119617,
1812
+ "grad_norm": 5.007632732391357,
1813
+ "learning_rate": 1.526852950422226e-05,
1814
+ "loss": 1.9023,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 4.899521531100478,
1819
+ "grad_norm": 3.3414082527160645,
1820
+ "learning_rate": 1.4722442884133214e-05,
1821
+ "loss": 2.0638,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 4.91866028708134,
1826
+ "grad_norm": 4.421596050262451,
1827
+ "learning_rate": 1.4185523646469822e-05,
1828
+ "loss": 2.0366,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 4.937799043062201,
1833
+ "grad_norm": 5.466579914093018,
1834
+ "learning_rate": 1.3657829510333654e-05,
1835
+ "loss": 1.8725,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 4.956937799043062,
1840
+ "grad_norm": 3.617340326309204,
1841
+ "learning_rate": 1.3139417203123027e-05,
1842
+ "loss": 2.0564,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 4.976076555023924,
1847
+ "grad_norm": 3.936239719390869,
1848
+ "learning_rate": 1.263034245443473e-05,
1849
+ "loss": 1.8447,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 4.9952153110047846,
1854
+ "grad_norm": 5.094753265380859,
1855
+ "learning_rate": 1.2130659990073146e-05,
1856
+ "loss": 1.7893,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 5.0,
1861
+ "grad_norm": 8.643269538879395,
1862
+ "learning_rate": 1.1640423526166988e-05,
1863
+ "loss": 1.6151,
1864
+ "step": 265
1865
+ }
1866
+ ],
1867
+ "logging_steps": 1,
1868
+ "max_steps": 312,
1869
+ "num_input_tokens_seen": 0,
1870
+ "num_train_epochs": 6,
1871
+ "save_steps": 500,
1872
+ "stateful_callbacks": {
1873
+ "TrainerControl": {
1874
+ "args": {
1875
+ "should_epoch_stop": false,
1876
+ "should_evaluate": false,
1877
+ "should_log": false,
1878
+ "should_save": true,
1879
+ "should_training_stop": false
1880
+ },
1881
+ "attributes": {}
1882
+ }
1883
+ },
1884
+ "total_flos": 741644594380800.0,
1885
+ "train_batch_size": 4,
1886
+ "trial_name": null,
1887
+ "trial_params": null
1888
+ }
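The tail of `checkpoint-265/trainer_state.json` above closes the per-step log (one entry per step, since `logging_steps` is 1) and records the run metadata (312 max steps, 6 epochs, train batch size 4). The log can be read back offline to sanity-check a run; a minimal sketch, assuming the checkpoint directory is local and the entries sit under the usual `log_history` key:

```python
import json

# Load the state the Trainer wrote alongside this checkpoint (path assumed).
with open("checkpoint-265/trainer_state.json") as f:
    state = json.load(f)

# log_history holds one entry per logged step; keep only loss-bearing entries.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

print(f"{len(steps)} logged steps, final loss {losses[-1]:.4f}")
```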
checkpoint-265/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8d8f75328c89f0f8d97ecc3fb21f0a76fa9b188979afd06060c2f286d07806
3
+ size 6456
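`training_args.bin` is stored through Git LFS, so the diff shows only the three-line pointer (spec version, sha256 `oid`, byte `size`) rather than the binary itself. Once the real file has been pulled, it can be checked against the pointer; a small sketch, with the local path assumed:

```python
import hashlib

# sha256 of the materialized file should equal the pointer's oid.
with open("checkpoint-265/training_args.bin", "rb") as f:  # path assumed
    digest = hashlib.sha256(f.read()).hexdigest()

expected = "bf8d8f75328c89f0f8d97ecc3fb21f0a76fa9b188979afd06060c2f286d07806"
print("ok" if digest == expected else "mismatch - likely still an LFS pointer stub")
```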
checkpoint-265/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-312/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: Qwen/Qwen2.5-0.5B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
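This card is the auto-generated PEFT template, so its "How to Get Started" section is still a placeholder. A minimal loading sketch for this layout, assuming the checkpoint directory is available locally and using the standard `transformers` + `peft` calls:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Base model comes from adapter_config.json; the adapter path is assumed local.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
model = PeftModel.from_pretrained(base, "checkpoint-312")

inputs = tokenizer("Hello,", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```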
checkpoint-312/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-0.5B",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": null,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "o_proj",
27
+ "v_proj",
28
+ "q_proj",
29
+ "k_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
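The config describes a rank-16 LoRA (`lora_alpha` 32, no dropout) over the four attention projections (`q_proj`, `k_proj`, `v_proj`, `o_proj`) of `Qwen/Qwen2.5-0.5B`. It round-trips through `peft`; a quick inspection sketch, local path assumed:

```python
from peft import PeftConfig

# PeftConfig.from_pretrained reads adapter_config.json from the directory.
cfg = PeftConfig.from_pretrained("checkpoint-312")
print(cfg.base_model_name_or_path)               # Qwen/Qwen2.5-0.5B
print(cfg.r, cfg.lora_alpha, sorted(cfg.target_modules))
```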
checkpoint-312/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b35702578df1a77f49fbaf57a0081dd8c817e7fe1a66b861ce989c67e99d0c2
3
+ size 8676008
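The adapter weights themselves are an ~8.7 MB safetensors file behind another LFS pointer. After fetching, the tensors can be listed directly; a sketch, assuming the `safetensors` package and a local copy:

```python
from safetensors.torch import load_file

# Tensor names follow peft's scheme, e.g. ...q_proj.lora_A.weight.
weights = load_file("checkpoint-312/adapter_model.safetensors")  # path assumed
for name, tensor in sorted(weights.items())[:4]:
    print(name, tuple(tensor.shape))
```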