pawan2411 committed on Mar 8

Commit

820bae1

verified ·

1 Parent(s): 617116e

MPCOT v4: Deep Panini + 10-language CoT SFT adapter

Browse files

Files changed (42) hide show

.gitattributes +4 -0
README.md +62 -0
adapter_config.json +46 -0
adapter_model.safetensors +3 -0
chat_template.jinja +54 -0
checkpoint-1400/README.md +209 -0
checkpoint-1400/adapter_config.json +46 -0
checkpoint-1400/adapter_model.safetensors +3 -0
checkpoint-1400/chat_template.jinja +54 -0
checkpoint-1400/optimizer.pt +3 -0
checkpoint-1400/rng_state.pth +3 -0
checkpoint-1400/scheduler.pt +3 -0
checkpoint-1400/tokenizer.json +3 -0
checkpoint-1400/tokenizer_config.json +30 -0
checkpoint-1400/trainer_state.json +671 -0
checkpoint-1400/training_args.bin +3 -0
checkpoint-1600/README.md +209 -0
checkpoint-1600/adapter_config.json +46 -0
checkpoint-1600/adapter_model.safetensors +3 -0
checkpoint-1600/chat_template.jinja +54 -0
checkpoint-1600/optimizer.pt +3 -0
checkpoint-1600/rng_state.pth +3 -0
checkpoint-1600/scheduler.pt +3 -0
checkpoint-1600/tokenizer.json +3 -0
checkpoint-1600/tokenizer_config.json +30 -0
checkpoint-1600/trainer_state.json +762 -0
checkpoint-1600/training_args.bin +3 -0
checkpoint-1683/README.md +209 -0
checkpoint-1683/adapter_config.json +46 -0
checkpoint-1683/adapter_model.safetensors +3 -0
checkpoint-1683/chat_template.jinja +54 -0
checkpoint-1683/optimizer.pt +3 -0
checkpoint-1683/rng_state.pth +3 -0
checkpoint-1683/scheduler.pt +3 -0
checkpoint-1683/tokenizer.json +3 -0
checkpoint-1683/tokenizer_config.json +30 -0
checkpoint-1683/trainer_state.json +792 -0
checkpoint-1683/training_args.bin +3 -0
tokenizer.json +3 -0
tokenizer_config.json +30 -0
training_args.bin +3 -0
training_config.json +11 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1683/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+model_name: mpcot_qwen7b_lora
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
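+license: apache-2.0  # NOTE(review): assumed to follow the base model Qwen/Qwen2.5-7B-Instruct — confirm; the original `licence: license` was an unfilled placeholder under a key the Hub does not recognise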
+pipeline_tag: text-generation
+---
+# Model Card for mpcot_qwen7b_lora
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
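+generator = pipeline("text-generation", model="<hub-user>/mpcot_qwen7b_lora", device="cuda")  # TODO: replace with this adapter's actual Hub repo id or a local path; the generated card had the unfilled placeholder "None"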
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.18.1
+- TRL: 0.29.0
+- Transformers: 5.0.0
+- Pytorch: 2.10.0+cu128
+- Datasets: 4.0.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be15156e4206b36e88697f973f0757a6eb2e18abadf49ac66348796353b26c7c
+size 645975704

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-1400/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-1400/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-1400/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d49570902725151e1772a09d041780e1df02c9296722ec152853c17c967c6ef
+size 645975704

checkpoint-1400/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-1400/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f36b3e08fde39f1a70a3e960eb7d829189e8fd455fb30671eee91cfab4048829
+size 1292182139

checkpoint-1400/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c74bff3a7c4bc281b33b0b8e11d8123d149fa629cd264735e3d1419cd7b1386
+size 14645

checkpoint-1400/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a57b1a48d66b223ec979eed79f59f1aeaab5c15f1823e44893266234805bbea6
+size 1465

checkpoint-1400/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

checkpoint-1400/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-1400/trainer_state.json ADDED Viewed

	@@ -0,0 +1,671 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.4955436720142603,
+  "eval_steps": 200,
+  "global_step": 1400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.1273316520452499,
+      "epoch": 0.044563279857397504,
+      "grad_norm": 2.0025203227996826,
+      "learning_rate": 3.6e-06,
+      "loss": 2.2611521911621093,
+      "mean_token_accuracy": 0.6291543507575988,
+      "num_tokens": 363359.0,
+      "step": 25
+    },
+    {
+      "entropy": 1.391269074678421,
+      "epoch": 0.08912655971479501,
+      "grad_norm": 1.0109221935272217,
+      "learning_rate": 7.35e-06,
+      "loss": 1.767060546875,
+      "mean_token_accuracy": 0.6594330656528473,
+      "num_tokens": 724311.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2697400665283203,
+      "epoch": 0.13368983957219252,
+      "grad_norm": 0.4787505269050598,
+      "learning_rate": 1.11e-05,
+      "loss": 1.231834716796875,
+      "mean_token_accuracy": 0.7449960750341416,
+      "num_tokens": 1089859.0,
+      "step": 75
+    },
+    {
+      "entropy": 1.0446020710468291,
+      "epoch": 0.17825311942959002,
+      "grad_norm": 0.39569053053855896,
+      "learning_rate": 1.485e-05,
+      "loss": 1.0368045043945313,
+      "mean_token_accuracy": 0.7771743559837341,
+      "num_tokens": 1456575.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.9694650781154632,
+      "epoch": 0.22281639928698752,
+      "grad_norm": 0.44794389605522156,
+      "learning_rate": 1.4991494309781894e-05,
+      "loss": 0.9510629272460938,
+      "mean_token_accuracy": 0.7904590421915054,
+      "num_tokens": 1819729.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.9067935299873352,
+      "epoch": 0.26737967914438504,
+      "grad_norm": 0.49256861209869385,
+      "learning_rate": 1.4964566090257208e-05,
+      "loss": 0.8909156036376953,
+      "mean_token_accuracy": 0.8001276826858521,
+      "num_tokens": 2185895.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.8762328952550889,
+      "epoch": 0.31194295900178254,
+      "grad_norm": 0.48932692408561707,
+      "learning_rate": 1.4919266844792835e-05,
+      "loss": 0.8628057098388672,
+      "mean_token_accuracy": 0.8043822544813156,
+      "num_tokens": 2554889.0,
+      "step": 175
+    },
+    {
+      "entropy": 0.8572803306579589,
+      "epoch": 0.35650623885918004,
+      "grad_norm": 0.5422897338867188,
+      "learning_rate": 1.485570805925459e-05,
+      "loss": 0.8397312927246093,
+      "mean_token_accuracy": 0.8085139858722686,
+      "num_tokens": 2920719.0,
+      "step": 200
+    },
+    {
+      "epoch": 0.35650623885918004,
+      "eval_entropy": 0.8323967654705048,
+      "eval_loss": 0.8291334509849548,
+      "eval_mean_token_accuracy": 0.8096436858177185,
+      "eval_num_tokens": 2920719.0,
+      "eval_runtime": 74.8901,
+      "eval_samples_per_second": 13.313,
+      "eval_steps_per_second": 3.338,
+      "step": 200
+    },
+    {
+      "entropy": 0.8403333276510239,
+      "epoch": 0.40106951871657753,
+      "grad_norm": 0.5801687240600586,
+      "learning_rate": 1.4774046158019147e-05,
+      "loss": 0.8235167694091797,
+      "mean_token_accuracy": 0.8098820477724076,
+      "num_tokens": 3288435.0,
+      "step": 225
+    },
+    {
+      "entropy": 0.817181087732315,
+      "epoch": 0.44563279857397503,
+      "grad_norm": 0.603880763053894,
+      "learning_rate": 1.467448211899838e-05,
+      "loss": 0.799793701171875,
+      "mean_token_accuracy": 0.8144050502777099,
+      "num_tokens": 3654202.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.8009092861413956,
+      "epoch": 0.49019607843137253,
+      "grad_norm": 0.5765889883041382,
+      "learning_rate": 1.4557260979013106e-05,
+      "loss": 0.7848175811767578,
+      "mean_token_accuracy": 0.8172187548875809,
+      "num_tokens": 4016287.0,
+      "step": 275
+    },
+    {
+      "entropy": 0.8024452942609787,
+      "epoch": 0.5347593582887701,
+      "grad_norm": 0.7014256715774536,
+      "learning_rate": 1.4422671230733536e-05,
+      "loss": 0.7894332122802734,
+      "mean_token_accuracy": 0.8166103160381317,
+      "num_tokens": 4379803.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.7904212397336959,
+      "epoch": 0.5793226381461676,
+      "grad_norm": 0.6543148756027222,
+      "learning_rate": 1.4271044112670647e-05,
+      "loss": 0.7738318634033203,
+      "mean_token_accuracy": 0.8181957858800888,
+      "num_tokens": 4748127.0,
+      "step": 325
+    },
+    {
+      "entropy": 0.7665162217617035,
+      "epoch": 0.6238859180035651,
+      "grad_norm": 0.7135517001152039,
+      "learning_rate": 1.410275279396588e-05,
+      "loss": 0.7533625030517578,
+      "mean_token_accuracy": 0.8217650431394578,
+      "num_tokens": 5113040.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.7557546135783195,
+      "epoch": 0.6684491978609626,
+      "grad_norm": 0.6762207746505737,
+      "learning_rate": 1.3918211455985435e-05,
+      "loss": 0.7417732238769531,
+      "mean_token_accuracy": 0.8234544372558594,
+      "num_tokens": 5477938.0,
+      "step": 375
+    },
+    {
+      "entropy": 0.7480651473999024,
+      "epoch": 0.7130124777183601,
+      "grad_norm": 0.6766519546508789,
+      "learning_rate": 1.3717874272979488e-05,
+      "loss": 0.7343754577636719,
+      "mean_token_accuracy": 0.8247038215398789,
+      "num_tokens": 5846777.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.7130124777183601,
+      "eval_entropy": 0.7494170541763305,
+      "eval_loss": 0.7359814047813416,
+      "eval_mean_token_accuracy": 0.8247466235160827,
+      "eval_num_tokens": 5846777.0,
+      "eval_runtime": 74.7841,
+      "eval_samples_per_second": 13.332,
+      "eval_steps_per_second": 3.343,
+      "step": 400
+    },
+    {
+      "entropy": 0.7388822847604751,
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.7576785683631897,
+      "learning_rate": 1.350223429431504e-05,
+      "loss": 0.7303533935546875,
+      "mean_token_accuracy": 0.82606416285038,
+      "num_tokens": 6212618.0,
+      "step": 425
+    },
+    {
+      "entropy": 0.7430421102046967,
+      "epoch": 0.8021390374331551,
+      "grad_norm": 0.7369253635406494,
+      "learning_rate": 1.3271822231033263e-05,
+      "loss": 0.7292195129394531,
+      "mean_token_accuracy": 0.8252584689855575,
+      "num_tokens": 6578764.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.7350365900993348,
+      "epoch": 0.8467023172905526,
+      "grad_norm": 0.7027698755264282,
+      "learning_rate": 1.3027205149717825e-05,
+      "loss": 0.7203064727783203,
+      "mean_token_accuracy": 0.8271685636043549,
+      "num_tokens": 6940517.0,
+      "step": 475
+    },
+    {
+      "entropy": 0.7169802790880203,
+      "epoch": 0.8912655971479501,
+      "grad_norm": 0.7340224981307983,
+      "learning_rate": 1.276898507688866e-05,
+      "loss": 0.705379867553711,
+      "mean_token_accuracy": 0.8299148625135422,
+      "num_tokens": 7306466.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.7128468745946884,
+      "epoch": 0.9358288770053476,
+      "grad_norm": 0.7902767658233643,
+      "learning_rate": 1.2497797517355924e-05,
+      "loss": 0.6976683807373046,
+      "mean_token_accuracy": 0.8309504073858262,
+      "num_tokens": 7675590.0,
+      "step": 525
+    },
+    {
+      "entropy": 0.7067722028493881,
+      "epoch": 0.9803921568627451,
+      "grad_norm": 0.7943085432052612,
+      "learning_rate": 1.2214309890180613e-05,
+      "loss": 0.6949668884277344,
+      "mean_token_accuracy": 0.8305781084299088,
+      "num_tokens": 8042404.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.695909548997879,
+      "epoch": 1.0249554367201426,
+      "grad_norm": 0.7510514259338379,
+      "learning_rate": 1.191921988609109e-05,
+      "loss": 0.6792121124267578,
+      "mean_token_accuracy": 0.8343433332443237,
+      "num_tokens": 8403933.0,
+      "step": 575
+    },
+    {
+      "entropy": 0.6738390463590622,
+      "epoch": 1.0695187165775402,
+      "grad_norm": 0.8021165132522583,
+      "learning_rate": 1.1613253750398085e-05,
+      "loss": 0.6603101348876953,
+      "mean_token_accuracy": 0.8382544696331025,
+      "num_tokens": 8772072.0,
+      "step": 600
+    },
+    {
+      "epoch": 1.0695187165775402,
+      "eval_entropy": 0.6920017371177674,
+      "eval_loss": 0.6961521506309509,
+      "eval_mean_token_accuracy": 0.8314581851959228,
+      "eval_num_tokens": 8772072.0,
+      "eval_runtime": 74.8097,
+      "eval_samples_per_second": 13.327,
+      "eval_steps_per_second": 3.342,
+      "step": 600
+    },
+    {
+      "entropy": 0.6920944279432297,
+      "epoch": 1.1140819964349375,
+      "grad_norm": 0.8023701310157776,
+      "learning_rate": 1.1297164495634069e-05,
+      "loss": 0.6772218322753907,
+      "mean_token_accuracy": 0.8343758553266525,
+      "num_tokens": 9137160.0,
+      "step": 625
+    },
+    {
+      "entropy": 0.67285136282444,
+      "epoch": 1.1586452762923352,
+      "grad_norm": 0.7788256406784058,
+      "learning_rate": 1.0971730048315917e-05,
+      "loss": 0.6581203460693359,
+      "mean_token_accuracy": 0.8390156370401383,
+      "num_tokens": 9505580.0,
+      "step": 650
+    },
+    {
+      "entropy": 0.6888180702924729,
+      "epoch": 1.2032085561497325,
+      "grad_norm": 0.8268939256668091,
+      "learning_rate": 1.0637751334391775e-05,
+      "loss": 0.673553466796875,
+      "mean_token_accuracy": 0.8359775388240814,
+      "num_tokens": 9868570.0,
+      "step": 675
+    },
+    {
+      "entropy": 0.6915264892578125,
+      "epoch": 1.2477718360071302,
+      "grad_norm": 0.8361654877662659,
+      "learning_rate": 1.0296050308084114e-05,
+      "loss": 0.6790201568603516,
+      "mean_token_accuracy": 0.8342142343521118,
+      "num_tokens": 10229373.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.6885707491636276,
+      "epoch": 1.2923351158645278,
+      "grad_norm": 0.7386716604232788,
+      "learning_rate": 9.94746792898014e-06,
+      "loss": 0.6720596313476562,
+      "mean_token_accuracy": 0.8353542894124985,
+      "num_tokens": 10595419.0,
+      "step": 725
+    },
+    {
+      "entropy": 0.6660267195105553,
+      "epoch": 1.3368983957219251,
+      "grad_norm": 0.7973800897598267,
+      "learning_rate": 9.59286209234813e-06,
+      "loss": 0.6550118255615235,
+      "mean_token_accuracy": 0.8386269718408584,
+      "num_tokens": 10960517.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.6469692060351372,
+      "epoch": 1.3814616755793225,
+      "grad_norm": 0.798152506351471,
+      "learning_rate": 9.233105517773445e-06,
+      "loss": 0.6308420181274415,
+      "mean_token_accuracy": 0.8429271316528321,
+      "num_tokens": 11328702.0,
+      "step": 775
+    },
+    {
+      "entropy": 0.6708013540506363,
+      "epoch": 1.4260249554367201,
+      "grad_norm": 0.9537823796272278,
+      "learning_rate": 8.869083601310398e-06,
+      "loss": 0.6537622833251953,
+      "mean_token_accuracy": 0.838316883444786,
+      "num_tokens": 11697546.0,
+      "step": 800
+    },
+    {
+      "epoch": 1.4260249554367201,
+      "eval_entropy": 0.670824561715126,
+      "eval_loss": 0.6723578572273254,
+      "eval_mean_token_accuracy": 0.8353032109737396,
+      "eval_num_tokens": 11697546.0,
+      "eval_runtime": 74.7664,
+      "eval_samples_per_second": 13.335,
+      "eval_steps_per_second": 3.344,
+      "step": 800
+    },
+    {
+      "entropy": 0.659270493388176,
+      "epoch": 1.4705882352941178,
+      "grad_norm": 0.846034586429596,
+      "learning_rate": 8.501692236436132e-06,
+      "loss": 0.6444293212890625,
+      "mean_token_accuracy": 0.8404667204618455,
+      "num_tokens": 12061827.0,
+      "step": 825
+    },
+    {
+      "entropy": 0.6627422112226486,
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.9181033968925476,
+      "learning_rate": 8.131835609169295e-06,
+      "loss": 0.6494012451171876,
+      "mean_token_accuracy": 0.839583694934845,
+      "num_tokens": 12427853.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.6641036707162857,
+      "epoch": 1.5597147950089125,
+      "grad_norm": 0.858001172542572,
+      "learning_rate": 7.760423972779985e-06,
+      "loss": 0.6495742797851562,
+      "mean_token_accuracy": 0.8395592844486237,
+      "num_tokens": 12799973.0,
+      "step": 875
+    },
+    {
+      "entropy": 0.6689085793495179,
+      "epoch": 1.6042780748663101,
+      "grad_norm": 0.8615349531173706,
+      "learning_rate": 7.388371407567565e-06,
+      "loss": 0.6532559967041016,
+      "mean_token_accuracy": 0.8388407498598098,
+      "num_tokens": 13166796.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.6729245400428772,
+      "epoch": 1.6488413547237077,
+      "grad_norm": 0.831142783164978,
+      "learning_rate": 7.01659357121981e-06,
+      "loss": 0.6572090911865235,
+      "mean_token_accuracy": 0.8372052818536758,
+      "num_tokens": 13532499.0,
+      "step": 925
+    },
+    {
+      "entropy": 0.6538485777378082,
+      "epoch": 1.6934046345811051,
+      "grad_norm": 0.919346809387207,
+      "learning_rate": 6.6460054452899315e-06,
+      "loss": 0.6404708862304688,
+      "mean_token_accuracy": 0.8411308795213699,
+      "num_tokens": 13898404.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.6691750481724739,
+      "epoch": 1.7379679144385025,
+      "grad_norm": 0.9280221462249756,
+      "learning_rate": 6.277519083337656e-06,
+      "loss": 0.6546466827392579,
+      "mean_token_accuracy": 0.838825848698616,
+      "num_tokens": 14261658.0,
+      "step": 975
+    },
+    {
+      "entropy": 0.6536609560251236,
+      "epoch": 1.7825311942959001,
+      "grad_norm": 0.9000495076179504,
+      "learning_rate": 5.9120413662763545e-06,
+      "loss": 0.6405950927734375,
+      "mean_token_accuracy": 0.8412596487998962,
+      "num_tokens": 14625008.0,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7825311942959001,
+      "eval_entropy": 0.6716028243303299,
+      "eval_loss": 0.6561057567596436,
+      "eval_mean_token_accuracy": 0.8381222817897797,
+      "eval_num_tokens": 14625008.0,
+      "eval_runtime": 74.7617,
+      "eval_samples_per_second": 13.336,
+      "eval_steps_per_second": 3.344,
+      "step": 1000
+    },
+    {
+      "entropy": 0.6671841683983802,
+      "epoch": 1.8270944741532977,
+      "grad_norm": 0.8711400628089905,
+      "learning_rate": 5.550471770450572e-06,
+      "loss": 0.6500684356689453,
+      "mean_token_accuracy": 0.8389109486341476,
+      "num_tokens": 14985559.0,
+      "step": 1025
+    },
+    {
+      "entropy": 0.6568678751587868,
+      "epoch": 1.8716577540106951,
+      "grad_norm": 0.9135516285896301,
+      "learning_rate": 5.193700153936934e-06,
+      "loss": 0.6418634033203126,
+      "mean_token_accuracy": 0.8414819967746735,
+      "num_tokens": 15354311.0,
+      "step": 1050
+    },
+    {
+      "entropy": 0.6430006143450737,
+      "epoch": 1.9162210338680927,
+      "grad_norm": 0.9346958994865417,
+      "learning_rate": 4.842604566516537e-06,
+      "loss": 0.6278348541259766,
+      "mean_token_accuracy": 0.8434987276792526,
+      "num_tokens": 15721382.0,
+      "step": 1075
+    },
+    {
+      "entropy": 0.6387567144632339,
+      "epoch": 1.9607843137254903,
+      "grad_norm": 0.9693854451179504,
+      "learning_rate": 4.498049088708706e-06,
+      "loss": 0.6229427337646485,
+      "mean_token_accuracy": 0.8442350590229034,
+      "num_tokens": 16088038.0,
+      "step": 1100
+    },
+    {
+      "entropy": 0.6434592244029045,
+      "epoch": 2.0053475935828877,
+      "grad_norm": 0.9158383011817932,
+      "learning_rate": 4.160881705184478e-06,
+      "loss": 0.6287346649169921,
+      "mean_token_accuracy": 0.8434397971630097,
+      "num_tokens": 16448228.0,
+      "step": 1125
+    },
+    {
+      "entropy": 0.6293540370464324,
+      "epoch": 2.049910873440285,
+      "grad_norm": 0.9278510808944702,
+      "learning_rate": 3.831932217793526e-06,
+      "loss": 0.6089762115478515,
+      "mean_token_accuracy": 0.8473779886960984,
+      "num_tokens": 16812866.0,
+      "step": 1150
+    },
+    {
+      "entropy": 0.6246551343798638,
+      "epoch": 2.0944741532976825,
+      "grad_norm": 0.8729245066642761,
+      "learning_rate": 3.5120102033408053e-06,
+      "loss": 0.6066710281372071,
+      "mean_token_accuracy": 0.8471958756446838,
+      "num_tokens": 17177909.0,
+      "step": 1175
+    },
+    {
+      "entropy": 0.6269071605801583,
+      "epoch": 2.1390374331550803,
+      "grad_norm": 0.8709802031517029,
+      "learning_rate": 3.201903021138983e-06,
+      "loss": 0.6111587905883789,
+      "mean_token_accuracy": 0.8464664667844772,
+      "num_tokens": 17544377.0,
+      "step": 1200
+    },
+    {
+      "epoch": 2.1390374331550803,
+      "eval_entropy": 0.6344557646512985,
+      "eval_loss": 0.6462315320968628,
+      "eval_mean_token_accuracy": 0.8403205525875092,
+      "eval_num_tokens": 17544377.0,
+      "eval_runtime": 74.8344,
+      "eval_samples_per_second": 13.323,
+      "eval_steps_per_second": 3.341,
+      "step": 1200
+    },
+    {
+      "entropy": 0.617467094361782,
+      "epoch": 2.1836007130124777,
+      "grad_norm": 0.8771170973777771,
+      "learning_rate": 2.9023738752403013e-06,
+      "loss": 0.5986224746704102,
+      "mean_token_accuracy": 0.849560460448265,
+      "num_tokens": 17912855.0,
+      "step": 1225
+    },
+    {
+      "entropy": 0.6177873882651329,
+      "epoch": 2.228163992869875,
+      "grad_norm": 1.0253841876983643,
+      "learning_rate": 2.614159936116893e-06,
+      "loss": 0.5998103332519531,
+      "mean_token_accuracy": 0.8487882578372955,
+      "num_tokens": 18279476.0,
+      "step": 1250
+    },
+    {
+      "entropy": 0.6312283331155777,
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.9465038180351257,
+      "learning_rate": 2.337970526412267e-06,
+      "loss": 0.6118741226196289,
+      "mean_token_accuracy": 0.8458875006437302,
+      "num_tokens": 18644269.0,
+      "step": 1275
+    },
+    {
+      "entropy": 0.6209010258316994,
+      "epoch": 2.3172905525846703,
+      "grad_norm": 0.9807332158088684,
+      "learning_rate": 2.074485375229037e-06,
+      "loss": 0.6052029037475586,
+      "mean_token_accuracy": 0.8471564346551895,
+      "num_tokens": 19009107.0,
+      "step": 1300
+    },
+    {
+      "entropy": 0.6401337105035781,
+      "epoch": 2.3618538324420677,
+      "grad_norm": 1.0486506223678589,
+      "learning_rate": 1.82435294524924e-06,
+      "loss": 0.6207434463500977,
+      "mean_token_accuracy": 0.8439285135269166,
+      "num_tokens": 19374349.0,
+      "step": 1325
+    },
+    {
+      "entropy": 0.6109014016389847,
+      "epoch": 2.406417112299465,
+      "grad_norm": 0.9694714546203613,
+      "learning_rate": 1.5881888368043559e-06,
+      "loss": 0.5924215316772461,
+      "mean_token_accuracy": 0.8494464015960693,
+      "num_tokens": 19743047.0,
+      "step": 1350
+    },
+    {
+      "entropy": 0.6300237196683883,
+      "epoch": 2.450980392156863,
+      "grad_norm": 0.9961308836936951,
+      "learning_rate": 1.3665742728227932e-06,
+      "loss": 0.6133406066894531,
+      "mean_token_accuracy": 0.8462675029039383,
+      "num_tokens": 20105853.0,
+      "step": 1375
+    },
+    {
+      "entropy": 0.6148158556222916,
+      "epoch": 2.4955436720142603,
+      "grad_norm": 1.0224037170410156,
+      "learning_rate": 1.1600546683835065e-06,
+      "loss": 0.5978146362304687,
+      "mean_token_accuracy": 0.8488863033056259,
+      "num_tokens": 20469876.0,
+      "step": 1400
+    },
+    {
+      "epoch": 2.4955436720142603,
+      "eval_entropy": 0.627735008597374,
+      "eval_loss": 0.6408645510673523,
+      "eval_mean_token_accuracy": 0.8411034562587738,
+      "eval_num_tokens": 20469876.0,
+      "eval_runtime": 74.7955,
+      "eval_samples_per_second": 13.33,
+      "eval_steps_per_second": 3.342,
+      "step": 1400
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 1683,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.869455724212224e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1400/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
+size 5585

checkpoint-1600/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-1600/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-1600/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62be16623fffeb7fff7cfa473f3f40f65a8b24e56c10115f17f126702ebd0145
+size 645975704

checkpoint-1600/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-1600/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d64562ee726613097524ef08ae70a14248a08208bd4dbce81ba22d4e00986a6a
+size 1292182139

checkpoint-1600/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bbfa4fbea9e0f3d81284f0a321de33b26f22102eb534f6f79635582e04d4f709
+size 14645

checkpoint-1600/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f33245ca6596ae95a0f5c97c8ab914705616abef8b0a7e2812b61318bef5fff
+size 1465

checkpoint-1600/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

checkpoint-1600/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-1600/trainer_state.json ADDED Viewed

	@@ -0,0 +1,762 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.8520499108734403,
+  "eval_steps": 200,
+  "global_step": 1600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.1273316520452499,
+      "epoch": 0.044563279857397504,
+      "grad_norm": 2.0025203227996826,
+      "learning_rate": 3.6e-06,
+      "loss": 2.2611521911621093,
+      "mean_token_accuracy": 0.6291543507575988,
+      "num_tokens": 363359.0,
+      "step": 25
+    },
+    {
+      "entropy": 1.391269074678421,
+      "epoch": 0.08912655971479501,
+      "grad_norm": 1.0109221935272217,
+      "learning_rate": 7.35e-06,
+      "loss": 1.767060546875,
+      "mean_token_accuracy": 0.6594330656528473,
+      "num_tokens": 724311.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2697400665283203,
+      "epoch": 0.13368983957219252,
+      "grad_norm": 0.4787505269050598,
+      "learning_rate": 1.11e-05,
+      "loss": 1.231834716796875,
+      "mean_token_accuracy": 0.7449960750341416,
+      "num_tokens": 1089859.0,
+      "step": 75
+    },
+    {
+      "entropy": 1.0446020710468291,
+      "epoch": 0.17825311942959002,
+      "grad_norm": 0.39569053053855896,
+      "learning_rate": 1.485e-05,
+      "loss": 1.0368045043945313,
+      "mean_token_accuracy": 0.7771743559837341,
+      "num_tokens": 1456575.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.9694650781154632,
+      "epoch": 0.22281639928698752,
+      "grad_norm": 0.44794389605522156,
+      "learning_rate": 1.4991494309781894e-05,
+      "loss": 0.9510629272460938,
+      "mean_token_accuracy": 0.7904590421915054,
+      "num_tokens": 1819729.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.9067935299873352,
+      "epoch": 0.26737967914438504,
+      "grad_norm": 0.49256861209869385,
+      "learning_rate": 1.4964566090257208e-05,
+      "loss": 0.8909156036376953,
+      "mean_token_accuracy": 0.8001276826858521,
+      "num_tokens": 2185895.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.8762328952550889,
+      "epoch": 0.31194295900178254,
+      "grad_norm": 0.48932692408561707,
+      "learning_rate": 1.4919266844792835e-05,
+      "loss": 0.8628057098388672,
+      "mean_token_accuracy": 0.8043822544813156,
+      "num_tokens": 2554889.0,
+      "step": 175
+    },
+    {
+      "entropy": 0.8572803306579589,
+      "epoch": 0.35650623885918004,
+      "grad_norm": 0.5422897338867188,
+      "learning_rate": 1.485570805925459e-05,
+      "loss": 0.8397312927246093,
+      "mean_token_accuracy": 0.8085139858722686,
+      "num_tokens": 2920719.0,
+      "step": 200
+    },
+    {
+      "epoch": 0.35650623885918004,
+      "eval_entropy": 0.8323967654705048,
+      "eval_loss": 0.8291334509849548,
+      "eval_mean_token_accuracy": 0.8096436858177185,
+      "eval_num_tokens": 2920719.0,
+      "eval_runtime": 74.8901,
+      "eval_samples_per_second": 13.313,
+      "eval_steps_per_second": 3.338,
+      "step": 200
+    },
+    {
+      "entropy": 0.8403333276510239,
+      "epoch": 0.40106951871657753,
+      "grad_norm": 0.5801687240600586,
+      "learning_rate": 1.4774046158019147e-05,
+      "loss": 0.8235167694091797,
+      "mean_token_accuracy": 0.8098820477724076,
+      "num_tokens": 3288435.0,
+      "step": 225
+    },
+    {
+      "entropy": 0.817181087732315,
+      "epoch": 0.44563279857397503,
+      "grad_norm": 0.603880763053894,
+      "learning_rate": 1.467448211899838e-05,
+      "loss": 0.799793701171875,
+      "mean_token_accuracy": 0.8144050502777099,
+      "num_tokens": 3654202.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.8009092861413956,
+      "epoch": 0.49019607843137253,
+      "grad_norm": 0.5765889883041382,
+      "learning_rate": 1.4557260979013106e-05,
+      "loss": 0.7848175811767578,
+      "mean_token_accuracy": 0.8172187548875809,
+      "num_tokens": 4016287.0,
+      "step": 275
+    },
+    {
+      "entropy": 0.8024452942609787,
+      "epoch": 0.5347593582887701,
+      "grad_norm": 0.7014256715774536,
+      "learning_rate": 1.4422671230733536e-05,
+      "loss": 0.7894332122802734,
+      "mean_token_accuracy": 0.8166103160381317,
+      "num_tokens": 4379803.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.7904212397336959,
+      "epoch": 0.5793226381461676,
+      "grad_norm": 0.6543148756027222,
+      "learning_rate": 1.4271044112670647e-05,
+      "loss": 0.7738318634033203,
+      "mean_token_accuracy": 0.8181957858800888,
+      "num_tokens": 4748127.0,
+      "step": 325
+    },
+    {
+      "entropy": 0.7665162217617035,
+      "epoch": 0.6238859180035651,
+      "grad_norm": 0.7135517001152039,
+      "learning_rate": 1.410275279396588e-05,
+      "loss": 0.7533625030517578,
+      "mean_token_accuracy": 0.8217650431394578,
+      "num_tokens": 5113040.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.7557546135783195,
+      "epoch": 0.6684491978609626,
+      "grad_norm": 0.6762207746505737,
+      "learning_rate": 1.3918211455985435e-05,
+      "loss": 0.7417732238769531,
+      "mean_token_accuracy": 0.8234544372558594,
+      "num_tokens": 5477938.0,
+      "step": 375
+    },
+    {
+      "entropy": 0.7480651473999024,
+      "epoch": 0.7130124777183601,
+      "grad_norm": 0.6766519546508789,
+      "learning_rate": 1.3717874272979488e-05,
+      "loss": 0.7343754577636719,
+      "mean_token_accuracy": 0.8247038215398789,
+      "num_tokens": 5846777.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.7130124777183601,
+      "eval_entropy": 0.7494170541763305,
+      "eval_loss": 0.7359814047813416,
+      "eval_mean_token_accuracy": 0.8247466235160827,
+      "eval_num_tokens": 5846777.0,
+      "eval_runtime": 74.7841,
+      "eval_samples_per_second": 13.332,
+      "eval_steps_per_second": 3.343,
+      "step": 400
+    },
+    {
+      "entropy": 0.7388822847604751,
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.7576785683631897,
+      "learning_rate": 1.350223429431504e-05,
+      "loss": 0.7303533935546875,
+      "mean_token_accuracy": 0.82606416285038,
+      "num_tokens": 6212618.0,
+      "step": 425
+    },
+    {
+      "entropy": 0.7430421102046967,
+      "epoch": 0.8021390374331551,
+      "grad_norm": 0.7369253635406494,
+      "learning_rate": 1.3271822231033263e-05,
+      "loss": 0.7292195129394531,
+      "mean_token_accuracy": 0.8252584689855575,
+      "num_tokens": 6578764.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.7350365900993348,
+      "epoch": 0.8467023172905526,
+      "grad_norm": 0.7027698755264282,
+      "learning_rate": 1.3027205149717825e-05,
+      "loss": 0.7203064727783203,
+      "mean_token_accuracy": 0.8271685636043549,
+      "num_tokens": 6940517.0,
+      "step": 475
+    },
+    {
+      "entropy": 0.7169802790880203,
+      "epoch": 0.8912655971479501,
+      "grad_norm": 0.7340224981307983,
+      "learning_rate": 1.276898507688866e-05,
+      "loss": 0.705379867553711,
+      "mean_token_accuracy": 0.8299148625135422,
+      "num_tokens": 7306466.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.7128468745946884,
+      "epoch": 0.9358288770053476,
+      "grad_norm": 0.7902767658233643,
+      "learning_rate": 1.2497797517355924e-05,
+      "loss": 0.6976683807373046,
+      "mean_token_accuracy": 0.8309504073858262,
+      "num_tokens": 7675590.0,
+      "step": 525
+    },
+    {
+      "entropy": 0.7067722028493881,
+      "epoch": 0.9803921568627451,
+      "grad_norm": 0.7943085432052612,
+      "learning_rate": 1.2214309890180613e-05,
+      "loss": 0.6949668884277344,
+      "mean_token_accuracy": 0.8305781084299088,
+      "num_tokens": 8042404.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.695909548997879,
+      "epoch": 1.0249554367201426,
+      "grad_norm": 0.7510514259338379,
+      "learning_rate": 1.191921988609109e-05,
+      "loss": 0.6792121124267578,
+      "mean_token_accuracy": 0.8343433332443237,
+      "num_tokens": 8403933.0,
+      "step": 575
+    },
+    {
+      "entropy": 0.6738390463590622,
+      "epoch": 1.0695187165775402,
+      "grad_norm": 0.8021165132522583,
+      "learning_rate": 1.1613253750398085e-05,
+      "loss": 0.6603101348876953,
+      "mean_token_accuracy": 0.8382544696331025,
+      "num_tokens": 8772072.0,
+      "step": 600
+    },
+    {
+      "epoch": 1.0695187165775402,
+      "eval_entropy": 0.6920017371177674,
+      "eval_loss": 0.6961521506309509,
+      "eval_mean_token_accuracy": 0.8314581851959228,
+      "eval_num_tokens": 8772072.0,
+      "eval_runtime": 74.8097,
+      "eval_samples_per_second": 13.327,
+      "eval_steps_per_second": 3.342,
+      "step": 600
+    },
+    {
+      "entropy": 0.6920944279432297,
+      "epoch": 1.1140819964349375,
+      "grad_norm": 0.8023701310157776,
+      "learning_rate": 1.1297164495634069e-05,
+      "loss": 0.6772218322753907,
+      "mean_token_accuracy": 0.8343758553266525,
+      "num_tokens": 9137160.0,
+      "step": 625
+    },
+    {
+      "entropy": 0.67285136282444,
+      "epoch": 1.1586452762923352,
+      "grad_norm": 0.7788256406784058,
+      "learning_rate": 1.0971730048315917e-05,
+      "loss": 0.6581203460693359,
+      "mean_token_accuracy": 0.8390156370401383,
+      "num_tokens": 9505580.0,
+      "step": 650
+    },
+    {
+      "entropy": 0.6888180702924729,
+      "epoch": 1.2032085561497325,
+      "grad_norm": 0.8268939256668091,
+      "learning_rate": 1.0637751334391775e-05,
+      "loss": 0.673553466796875,
+      "mean_token_accuracy": 0.8359775388240814,
+      "num_tokens": 9868570.0,
+      "step": 675
+    },
+    {
+      "entropy": 0.6915264892578125,
+      "epoch": 1.2477718360071302,
+      "grad_norm": 0.8361654877662659,
+      "learning_rate": 1.0296050308084114e-05,
+      "loss": 0.6790201568603516,
+      "mean_token_accuracy": 0.8342142343521118,
+      "num_tokens": 10229373.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.6885707491636276,
+      "epoch": 1.2923351158645278,
+      "grad_norm": 0.7386716604232788,
+      "learning_rate": 9.94746792898014e-06,
+      "loss": 0.6720596313476562,
+      "mean_token_accuracy": 0.8353542894124985,
+      "num_tokens": 10595419.0,
+      "step": 725
+    },
+    {
+      "entropy": 0.6660267195105553,
+      "epoch": 1.3368983957219251,
+      "grad_norm": 0.7973800897598267,
+      "learning_rate": 9.59286209234813e-06,
+      "loss": 0.6550118255615235,
+      "mean_token_accuracy": 0.8386269718408584,
+      "num_tokens": 10960517.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.6469692060351372,
+      "epoch": 1.3814616755793225,
+      "grad_norm": 0.798152506351471,
+      "learning_rate": 9.233105517773445e-06,
+      "loss": 0.6308420181274415,
+      "mean_token_accuracy": 0.8429271316528321,
+      "num_tokens": 11328702.0,
+      "step": 775
+    },
+    {
+      "entropy": 0.6708013540506363,
+      "epoch": 1.4260249554367201,
+      "grad_norm": 0.9537823796272278,
+      "learning_rate": 8.869083601310398e-06,
+      "loss": 0.6537622833251953,
+      "mean_token_accuracy": 0.838316883444786,
+      "num_tokens": 11697546.0,
+      "step": 800
+    },
+    {
+      "epoch": 1.4260249554367201,
+      "eval_entropy": 0.670824561715126,
+      "eval_loss": 0.6723578572273254,
+      "eval_mean_token_accuracy": 0.8353032109737396,
+      "eval_num_tokens": 11697546.0,
+      "eval_runtime": 74.7664,
+      "eval_samples_per_second": 13.335,
+      "eval_steps_per_second": 3.344,
+      "step": 800
+    },
+    {
+      "entropy": 0.659270493388176,
+      "epoch": 1.4705882352941178,
+      "grad_norm": 0.846034586429596,
+      "learning_rate": 8.501692236436132e-06,
+      "loss": 0.6444293212890625,
+      "mean_token_accuracy": 0.8404667204618455,
+      "num_tokens": 12061827.0,
+      "step": 825
+    },
+    {
+      "entropy": 0.6627422112226486,
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.9181033968925476,
+      "learning_rate": 8.131835609169295e-06,
+      "loss": 0.6494012451171876,
+      "mean_token_accuracy": 0.839583694934845,
+      "num_tokens": 12427853.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.6641036707162857,
+      "epoch": 1.5597147950089125,
+      "grad_norm": 0.858001172542572,
+      "learning_rate": 7.760423972779985e-06,
+      "loss": 0.6495742797851562,
+      "mean_token_accuracy": 0.8395592844486237,
+      "num_tokens": 12799973.0,
+      "step": 875
+    },
+    {
+      "entropy": 0.6689085793495179,
+      "epoch": 1.6042780748663101,
+      "grad_norm": 0.8615349531173706,
+      "learning_rate": 7.388371407567565e-06,
+      "loss": 0.6532559967041016,
+      "mean_token_accuracy": 0.8388407498598098,
+      "num_tokens": 13166796.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.6729245400428772,
+      "epoch": 1.6488413547237077,
+      "grad_norm": 0.831142783164978,
+      "learning_rate": 7.01659357121981e-06,
+      "loss": 0.6572090911865235,
+      "mean_token_accuracy": 0.8372052818536758,
+      "num_tokens": 13532499.0,
+      "step": 925
+    },
+    {
+      "entropy": 0.6538485777378082,
+      "epoch": 1.6934046345811051,
+      "grad_norm": 0.919346809387207,
+      "learning_rate": 6.6460054452899315e-06,
+      "loss": 0.6404708862304688,
+      "mean_token_accuracy": 0.8411308795213699,
+      "num_tokens": 13898404.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.6691750481724739,
+      "epoch": 1.7379679144385025,
+      "grad_norm": 0.9280221462249756,
+      "learning_rate": 6.277519083337656e-06,
+      "loss": 0.6546466827392579,
+      "mean_token_accuracy": 0.838825848698616,
+      "num_tokens": 14261658.0,
+      "step": 975
+    },
+    {
+      "entropy": 0.6536609560251236,
+      "epoch": 1.7825311942959001,
+      "grad_norm": 0.9000495076179504,
+      "learning_rate": 5.9120413662763545e-06,
+      "loss": 0.6405950927734375,
+      "mean_token_accuracy": 0.8412596487998962,
+      "num_tokens": 14625008.0,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7825311942959001,
+      "eval_entropy": 0.6716028243303299,
+      "eval_loss": 0.6561057567596436,
+      "eval_mean_token_accuracy": 0.8381222817897797,
+      "eval_num_tokens": 14625008.0,
+      "eval_runtime": 74.7617,
+      "eval_samples_per_second": 13.336,
+      "eval_steps_per_second": 3.344,
+      "step": 1000
+    },
+    {
+      "entropy": 0.6671841683983802,
+      "epoch": 1.8270944741532977,
+      "grad_norm": 0.8711400628089905,
+      "learning_rate": 5.550471770450572e-06,
+      "loss": 0.6500684356689453,
+      "mean_token_accuracy": 0.8389109486341476,
+      "num_tokens": 14985559.0,
+      "step": 1025
+    },
+    {
+      "entropy": 0.6568678751587868,
+      "epoch": 1.8716577540106951,
+      "grad_norm": 0.9135516285896301,
+      "learning_rate": 5.193700153936934e-06,
+      "loss": 0.6418634033203126,
+      "mean_token_accuracy": 0.8414819967746735,
+      "num_tokens": 15354311.0,
+      "step": 1050
+    },
+    {
+      "entropy": 0.6430006143450737,
+      "epoch": 1.9162210338680927,
+      "grad_norm": 0.9346958994865417,
+      "learning_rate": 4.842604566516537e-06,
+      "loss": 0.6278348541259766,
+      "mean_token_accuracy": 0.8434987276792526,
+      "num_tokens": 15721382.0,
+      "step": 1075
+    },
+    {
+      "entropy": 0.6387567144632339,
+      "epoch": 1.9607843137254903,
+      "grad_norm": 0.9693854451179504,
+      "learning_rate": 4.498049088708706e-06,
+      "loss": 0.6229427337646485,
+      "mean_token_accuracy": 0.8442350590229034,
+      "num_tokens": 16088038.0,
+      "step": 1100
+    },
+    {
+      "entropy": 0.6434592244029045,
+      "epoch": 2.0053475935828877,
+      "grad_norm": 0.9158383011817932,
+      "learning_rate": 4.160881705184478e-06,
+      "loss": 0.6287346649169921,
+      "mean_token_accuracy": 0.8434397971630097,
+      "num_tokens": 16448228.0,
+      "step": 1125
+    },
+    {
+      "entropy": 0.6293540370464324,
+      "epoch": 2.049910873440285,
+      "grad_norm": 0.9278510808944702,
+      "learning_rate": 3.831932217793526e-06,
+      "loss": 0.6089762115478515,
+      "mean_token_accuracy": 0.8473779886960984,
+      "num_tokens": 16812866.0,
+      "step": 1150
+    },
+    {
+      "entropy": 0.6246551343798638,
+      "epoch": 2.0944741532976825,
+      "grad_norm": 0.8729245066642761,
+      "learning_rate": 3.5120102033408053e-06,
+      "loss": 0.6066710281372071,
+      "mean_token_accuracy": 0.8471958756446838,
+      "num_tokens": 17177909.0,
+      "step": 1175
+    },
+    {
+      "entropy": 0.6269071605801583,
+      "epoch": 2.1390374331550803,
+      "grad_norm": 0.8709802031517029,
+      "learning_rate": 3.201903021138983e-06,
+      "loss": 0.6111587905883789,
+      "mean_token_accuracy": 0.8464664667844772,
+      "num_tokens": 17544377.0,
+      "step": 1200
+    },
+    {
+      "epoch": 2.1390374331550803,
+      "eval_entropy": 0.6344557646512985,
+      "eval_loss": 0.6462315320968628,
+      "eval_mean_token_accuracy": 0.8403205525875092,
+      "eval_num_tokens": 17544377.0,
+      "eval_runtime": 74.8344,
+      "eval_samples_per_second": 13.323,
+      "eval_steps_per_second": 3.341,
+      "step": 1200
+    },
+    {
+      "entropy": 0.617467094361782,
+      "epoch": 2.1836007130124777,
+      "grad_norm": 0.8771170973777771,
+      "learning_rate": 2.9023738752403013e-06,
+      "loss": 0.5986224746704102,
+      "mean_token_accuracy": 0.849560460448265,
+      "num_tokens": 17912855.0,
+      "step": 1225
+    },
+    {
+      "entropy": 0.6177873882651329,
+      "epoch": 2.228163992869875,
+      "grad_norm": 1.0253841876983643,
+      "learning_rate": 2.614159936116893e-06,
+      "loss": 0.5998103332519531,
+      "mean_token_accuracy": 0.8487882578372955,
+      "num_tokens": 18279476.0,
+      "step": 1250
+    },
+    {
+      "entropy": 0.6312283331155777,
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.9465038180351257,
+      "learning_rate": 2.337970526412267e-06,
+      "loss": 0.6118741226196289,
+      "mean_token_accuracy": 0.8458875006437302,
+      "num_tokens": 18644269.0,
+      "step": 1275
+    },
+    {
+      "entropy": 0.6209010258316994,
+      "epoch": 2.3172905525846703,
+      "grad_norm": 0.9807332158088684,
+      "learning_rate": 2.074485375229037e-06,
+      "loss": 0.6052029037475586,
+      "mean_token_accuracy": 0.8471564346551895,
+      "num_tokens": 19009107.0,
+      "step": 1300
+    },
+    {
+      "entropy": 0.6401337105035781,
+      "epoch": 2.3618538324420677,
+      "grad_norm": 1.0486506223678589,
+      "learning_rate": 1.82435294524924e-06,
+      "loss": 0.6207434463500977,
+      "mean_token_accuracy": 0.8439285135269166,
+      "num_tokens": 19374349.0,
+      "step": 1325
+    },
+    {
+      "entropy": 0.6109014016389847,
+      "epoch": 2.406417112299465,
+      "grad_norm": 0.9694714546203613,
+      "learning_rate": 1.5881888368043559e-06,
+      "loss": 0.5924215316772461,
+      "mean_token_accuracy": 0.8494464015960693,
+      "num_tokens": 19743047.0,
+      "step": 1350
+    },
+    {
+      "entropy": 0.6300237196683883,
+      "epoch": 2.450980392156863,
+      "grad_norm": 0.9961308836936951,
+      "learning_rate": 1.3665742728227932e-06,
+      "loss": 0.6133406066894531,
+      "mean_token_accuracy": 0.8462675029039383,
+      "num_tokens": 20105853.0,
+      "step": 1375
+    },
+    {
+      "entropy": 0.6148158556222916,
+      "epoch": 2.4955436720142603,
+      "grad_norm": 1.0224037170410156,
+      "learning_rate": 1.1600546683835065e-06,
+      "loss": 0.5978146362304687,
+      "mean_token_accuracy": 0.8488863033056259,
+      "num_tokens": 20469876.0,
+      "step": 1400
+    },
+    {
+      "epoch": 2.4955436720142603,
+      "eval_entropy": 0.627735008597374,
+      "eval_loss": 0.6408645510673523,
+      "eval_mean_token_accuracy": 0.8411034562587738,
+      "eval_num_tokens": 20469876.0,
+      "eval_runtime": 74.7955,
+      "eval_samples_per_second": 13.33,
+      "eval_steps_per_second": 3.342,
+      "step": 1400
+    },
+    {
+      "entropy": 0.6221208718419075,
+      "epoch": 2.5401069518716577,
+      "grad_norm": 1.0483691692352295,
+      "learning_rate": 9.691382883962515e-07,
+      "loss": 0.6043234634399414,
+      "mean_token_accuracy": 0.8475923782587051,
+      "num_tokens": 20834908.0,
+      "step": 1425
+    },
+    {
+      "entropy": 0.6163172733783722,
+      "epoch": 2.5846702317290555,
+      "grad_norm": 1.0169743299484253,
+      "learning_rate": 7.942949967120098e-07,
+      "loss": 0.6007443237304687,
+      "mean_token_accuracy": 0.8487154805660247,
+      "num_tokens": 21199575.0,
+      "step": 1450
+    },
+    {
+      "entropy": 0.6306376928091049,
+      "epoch": 2.629233511586453,
+      "grad_norm": 0.9749926328659058,
+      "learning_rate": 6.359550997421698e-07,
+      "loss": 0.6101107406616211,
+      "mean_token_accuracy": 0.8469714081287384,
+      "num_tokens": 21564414.0,
+      "step": 1475
+    },
+    {
+      "entropy": 0.6135326558351517,
+      "epoch": 2.6737967914438503,
+      "grad_norm": 1.0116835832595825,
+      "learning_rate": 4.945082874324541e-07,
+      "loss": 0.5956003189086914,
+      "mean_token_accuracy": 0.8500852519273758,
+      "num_tokens": 21928080.0,
+      "step": 1500
+    },
+    {
+      "entropy": 0.6165187922120094,
+      "epoch": 2.7183600713012477,
+      "grad_norm": 0.9928510785102844,
+      "learning_rate": 3.7030267419789764e-07,
+      "loss": 0.6013864135742187,
+      "mean_token_accuracy": 0.8494158619642258,
+      "num_tokens": 22296207.0,
+      "step": 1525
+    },
+    {
+      "entropy": 0.619775217473507,
+      "epoch": 2.762923351158645,
+      "grad_norm": 0.9901552796363831,
+      "learning_rate": 2.6364394217929856e-07,
+      "loss": 0.6034153366088867,
+      "mean_token_accuracy": 0.8480428576469421,
+      "num_tokens": 22661645.0,
+      "step": 1550
+    },
+    {
+      "entropy": 0.6086369237303734,
+      "epoch": 2.807486631016043,
+      "grad_norm": 0.8772838711738586,
+      "learning_rate": 1.7479458892961846e-07,
+      "loss": 0.5885520553588868,
+      "mean_token_accuracy": 0.8515380412340164,
+      "num_tokens": 23028234.0,
+      "step": 1575
+    },
+    {
+      "entropy": 0.6151553666591645,
+      "epoch": 2.8520499108734403,
+      "grad_norm": 0.9948622584342957,
+      "learning_rate": 1.0397328138187557e-07,
+      "loss": 0.5963270568847656,
+      "mean_token_accuracy": 0.8506602907180786,
+      "num_tokens": 23393799.0,
+      "step": 1600
+    },
+    {
+      "epoch": 2.8520499108734403,
+      "eval_entropy": 0.626643338561058,
+      "eval_loss": 0.6390902400016785,
+      "eval_mean_token_accuracy": 0.8414715526103973,
+      "eval_num_tokens": 23393799.0,
+      "eval_runtime": 74.7969,
+      "eval_samples_per_second": 13.329,
+      "eval_steps_per_second": 3.342,
+      "step": 1600
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 1683,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1278684184859034e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1600/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
+size 5585

checkpoint-1683/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-1683/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-1683/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be15156e4206b36e88697f973f0757a6eb2e18abadf49ac66348796353b26c7c
+size 645975704

checkpoint-1683/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-1683/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e6d833bf1cb9a5f8f663f12115ab5eb20f4a51a30008dd5ca9f77cbaf44b23b
+size 1292182139

checkpoint-1683/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19a191961b95a03b05655966beedb7977207e4e2a61b0fb5a169be43daad40f4
+size 14645

checkpoint-1683/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85d2044299f996170d57bec12325224a16707d54a6bba50a223724ae1ebb0267
+size 1465

checkpoint-1683/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

checkpoint-1683/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-1683/trainer_state.json ADDED Viewed

	@@ -0,0 +1,792 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 200,
+  "global_step": 1683,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.1273316520452499,
+      "epoch": 0.044563279857397504,
+      "grad_norm": 2.0025203227996826,
+      "learning_rate": 3.6e-06,
+      "loss": 2.2611521911621093,
+      "mean_token_accuracy": 0.6291543507575988,
+      "num_tokens": 363359.0,
+      "step": 25
+    },
+    {
+      "entropy": 1.391269074678421,
+      "epoch": 0.08912655971479501,
+      "grad_norm": 1.0109221935272217,
+      "learning_rate": 7.35e-06,
+      "loss": 1.767060546875,
+      "mean_token_accuracy": 0.6594330656528473,
+      "num_tokens": 724311.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2697400665283203,
+      "epoch": 0.13368983957219252,
+      "grad_norm": 0.4787505269050598,
+      "learning_rate": 1.11e-05,
+      "loss": 1.231834716796875,
+      "mean_token_accuracy": 0.7449960750341416,
+      "num_tokens": 1089859.0,
+      "step": 75
+    },
+    {
+      "entropy": 1.0446020710468291,
+      "epoch": 0.17825311942959002,
+      "grad_norm": 0.39569053053855896,
+      "learning_rate": 1.485e-05,
+      "loss": 1.0368045043945313,
+      "mean_token_accuracy": 0.7771743559837341,
+      "num_tokens": 1456575.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.9694650781154632,
+      "epoch": 0.22281639928698752,
+      "grad_norm": 0.44794389605522156,
+      "learning_rate": 1.4991494309781894e-05,
+      "loss": 0.9510629272460938,
+      "mean_token_accuracy": 0.7904590421915054,
+      "num_tokens": 1819729.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.9067935299873352,
+      "epoch": 0.26737967914438504,
+      "grad_norm": 0.49256861209869385,
+      "learning_rate": 1.4964566090257208e-05,
+      "loss": 0.8909156036376953,
+      "mean_token_accuracy": 0.8001276826858521,
+      "num_tokens": 2185895.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.8762328952550889,
+      "epoch": 0.31194295900178254,
+      "grad_norm": 0.48932692408561707,
+      "learning_rate": 1.4919266844792835e-05,
+      "loss": 0.8628057098388672,
+      "mean_token_accuracy": 0.8043822544813156,
+      "num_tokens": 2554889.0,
+      "step": 175
+    },
+    {
+      "entropy": 0.8572803306579589,
+      "epoch": 0.35650623885918004,
+      "grad_norm": 0.5422897338867188,
+      "learning_rate": 1.485570805925459e-05,
+      "loss": 0.8397312927246093,
+      "mean_token_accuracy": 0.8085139858722686,
+      "num_tokens": 2920719.0,
+      "step": 200
+    },
+    {
+      "epoch": 0.35650623885918004,
+      "eval_entropy": 0.8323967654705048,
+      "eval_loss": 0.8291334509849548,
+      "eval_mean_token_accuracy": 0.8096436858177185,
+      "eval_num_tokens": 2920719.0,
+      "eval_runtime": 74.8901,
+      "eval_samples_per_second": 13.313,
+      "eval_steps_per_second": 3.338,
+      "step": 200
+    },
+    {
+      "entropy": 0.8403333276510239,
+      "epoch": 0.40106951871657753,
+      "grad_norm": 0.5801687240600586,
+      "learning_rate": 1.4774046158019147e-05,
+      "loss": 0.8235167694091797,
+      "mean_token_accuracy": 0.8098820477724076,
+      "num_tokens": 3288435.0,
+      "step": 225
+    },
+    {
+      "entropy": 0.817181087732315,
+      "epoch": 0.44563279857397503,
+      "grad_norm": 0.603880763053894,
+      "learning_rate": 1.467448211899838e-05,
+      "loss": 0.799793701171875,
+      "mean_token_accuracy": 0.8144050502777099,
+      "num_tokens": 3654202.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.8009092861413956,
+      "epoch": 0.49019607843137253,
+      "grad_norm": 0.5765889883041382,
+      "learning_rate": 1.4557260979013106e-05,
+      "loss": 0.7848175811767578,
+      "mean_token_accuracy": 0.8172187548875809,
+      "num_tokens": 4016287.0,
+      "step": 275
+    },
+    {
+      "entropy": 0.8024452942609787,
+      "epoch": 0.5347593582887701,
+      "grad_norm": 0.7014256715774536,
+      "learning_rate": 1.4422671230733536e-05,
+      "loss": 0.7894332122802734,
+      "mean_token_accuracy": 0.8166103160381317,
+      "num_tokens": 4379803.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.7904212397336959,
+      "epoch": 0.5793226381461676,
+      "grad_norm": 0.6543148756027222,
+      "learning_rate": 1.4271044112670647e-05,
+      "loss": 0.7738318634033203,
+      "mean_token_accuracy": 0.8181957858800888,
+      "num_tokens": 4748127.0,
+      "step": 325
+    },
+    {
+      "entropy": 0.7665162217617035,
+      "epoch": 0.6238859180035651,
+      "grad_norm": 0.7135517001152039,
+      "learning_rate": 1.410275279396588e-05,
+      "loss": 0.7533625030517578,
+      "mean_token_accuracy": 0.8217650431394578,
+      "num_tokens": 5113040.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.7557546135783195,
+      "epoch": 0.6684491978609626,
+      "grad_norm": 0.6762207746505737,
+      "learning_rate": 1.3918211455985435e-05,
+      "loss": 0.7417732238769531,
+      "mean_token_accuracy": 0.8234544372558594,
+      "num_tokens": 5477938.0,
+      "step": 375
+    },
+    {
+      "entropy": 0.7480651473999024,
+      "epoch": 0.7130124777183601,
+      "grad_norm": 0.6766519546508789,
+      "learning_rate": 1.3717874272979488e-05,
+      "loss": 0.7343754577636719,
+      "mean_token_accuracy": 0.8247038215398789,
+      "num_tokens": 5846777.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.7130124777183601,
+      "eval_entropy": 0.7494170541763305,
+      "eval_loss": 0.7359814047813416,
+      "eval_mean_token_accuracy": 0.8247466235160827,
+      "eval_num_tokens": 5846777.0,
+      "eval_runtime": 74.7841,
+      "eval_samples_per_second": 13.332,
+      "eval_steps_per_second": 3.343,
+      "step": 400
+    },
+    {
+      "entropy": 0.7388822847604751,
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.7576785683631897,
+      "learning_rate": 1.350223429431504e-05,
+      "loss": 0.7303533935546875,
+      "mean_token_accuracy": 0.82606416285038,
+      "num_tokens": 6212618.0,
+      "step": 425
+    },
+    {
+      "entropy": 0.7430421102046967,
+      "epoch": 0.8021390374331551,
+      "grad_norm": 0.7369253635406494,
+      "learning_rate": 1.3271822231033263e-05,
+      "loss": 0.7292195129394531,
+      "mean_token_accuracy": 0.8252584689855575,
+      "num_tokens": 6578764.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.7350365900993348,
+      "epoch": 0.8467023172905526,
+      "grad_norm": 0.7027698755264282,
+      "learning_rate": 1.3027205149717825e-05,
+      "loss": 0.7203064727783203,
+      "mean_token_accuracy": 0.8271685636043549,
+      "num_tokens": 6940517.0,
+      "step": 475
+    },
+    {
+      "entropy": 0.7169802790880203,
+      "epoch": 0.8912655971479501,
+      "grad_norm": 0.7340224981307983,
+      "learning_rate": 1.276898507688866e-05,
+      "loss": 0.705379867553711,
+      "mean_token_accuracy": 0.8299148625135422,
+      "num_tokens": 7306466.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.7128468745946884,
+      "epoch": 0.9358288770053476,
+      "grad_norm": 0.7902767658233643,
+      "learning_rate": 1.2497797517355924e-05,
+      "loss": 0.6976683807373046,
+      "mean_token_accuracy": 0.8309504073858262,
+      "num_tokens": 7675590.0,
+      "step": 525
+    },
+    {
+      "entropy": 0.7067722028493881,
+      "epoch": 0.9803921568627451,
+      "grad_norm": 0.7943085432052612,
+      "learning_rate": 1.2214309890180613e-05,
+      "loss": 0.6949668884277344,
+      "mean_token_accuracy": 0.8305781084299088,
+      "num_tokens": 8042404.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.695909548997879,
+      "epoch": 1.0249554367201426,
+      "grad_norm": 0.7510514259338379,
+      "learning_rate": 1.191921988609109e-05,
+      "loss": 0.6792121124267578,
+      "mean_token_accuracy": 0.8343433332443237,
+      "num_tokens": 8403933.0,
+      "step": 575
+    },
+    {
+      "entropy": 0.6738390463590622,
+      "epoch": 1.0695187165775402,
+      "grad_norm": 0.8021165132522583,
+      "learning_rate": 1.1613253750398085e-05,
+      "loss": 0.6603101348876953,
+      "mean_token_accuracy": 0.8382544696331025,
+      "num_tokens": 8772072.0,
+      "step": 600
+    },
+    {
+      "epoch": 1.0695187165775402,
+      "eval_entropy": 0.6920017371177674,
+      "eval_loss": 0.6961521506309509,
+      "eval_mean_token_accuracy": 0.8314581851959228,
+      "eval_num_tokens": 8772072.0,
+      "eval_runtime": 74.8097,
+      "eval_samples_per_second": 13.327,
+      "eval_steps_per_second": 3.342,
+      "step": 600
+    },
+    {
+      "entropy": 0.6920944279432297,
+      "epoch": 1.1140819964349375,
+      "grad_norm": 0.8023701310157776,
+      "learning_rate": 1.1297164495634069e-05,
+      "loss": 0.6772218322753907,
+      "mean_token_accuracy": 0.8343758553266525,
+      "num_tokens": 9137160.0,
+      "step": 625
+    },
+    {
+      "entropy": 0.67285136282444,
+      "epoch": 1.1586452762923352,
+      "grad_norm": 0.7788256406784058,
+      "learning_rate": 1.0971730048315917e-05,
+      "loss": 0.6581203460693359,
+      "mean_token_accuracy": 0.8390156370401383,
+      "num_tokens": 9505580.0,
+      "step": 650
+    },
+    {
+      "entropy": 0.6888180702924729,
+      "epoch": 1.2032085561497325,
+      "grad_norm": 0.8268939256668091,
+      "learning_rate": 1.0637751334391775e-05,
+      "loss": 0.673553466796875,
+      "mean_token_accuracy": 0.8359775388240814,
+      "num_tokens": 9868570.0,
+      "step": 675
+    },
+    {
+      "entropy": 0.6915264892578125,
+      "epoch": 1.2477718360071302,
+      "grad_norm": 0.8361654877662659,
+      "learning_rate": 1.0296050308084114e-05,
+      "loss": 0.6790201568603516,
+      "mean_token_accuracy": 0.8342142343521118,
+      "num_tokens": 10229373.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.6885707491636276,
+      "epoch": 1.2923351158645278,
+      "grad_norm": 0.7386716604232788,
+      "learning_rate": 9.94746792898014e-06,
+      "loss": 0.6720596313476562,
+      "mean_token_accuracy": 0.8353542894124985,
+      "num_tokens": 10595419.0,
+      "step": 725
+    },
+    {
+      "entropy": 0.6660267195105553,
+      "epoch": 1.3368983957219251,
+      "grad_norm": 0.7973800897598267,
+      "learning_rate": 9.59286209234813e-06,
+      "loss": 0.6550118255615235,
+      "mean_token_accuracy": 0.8386269718408584,
+      "num_tokens": 10960517.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.6469692060351372,
+      "epoch": 1.3814616755793225,
+      "grad_norm": 0.798152506351471,
+      "learning_rate": 9.233105517773445e-06,
+      "loss": 0.6308420181274415,
+      "mean_token_accuracy": 0.8429271316528321,
+      "num_tokens": 11328702.0,
+      "step": 775
+    },
+    {
+      "entropy": 0.6708013540506363,
+      "epoch": 1.4260249554367201,
+      "grad_norm": 0.9537823796272278,
+      "learning_rate": 8.869083601310398e-06,
+      "loss": 0.6537622833251953,
+      "mean_token_accuracy": 0.838316883444786,
+      "num_tokens": 11697546.0,
+      "step": 800
+    },
+    {
+      "epoch": 1.4260249554367201,
+      "eval_entropy": 0.670824561715126,
+      "eval_loss": 0.6723578572273254,
+      "eval_mean_token_accuracy": 0.8353032109737396,
+      "eval_num_tokens": 11697546.0,
+      "eval_runtime": 74.7664,
+      "eval_samples_per_second": 13.335,
+      "eval_steps_per_second": 3.344,
+      "step": 800
+    },
+    {
+      "entropy": 0.659270493388176,
+      "epoch": 1.4705882352941178,
+      "grad_norm": 0.846034586429596,
+      "learning_rate": 8.501692236436132e-06,
+      "loss": 0.6444293212890625,
+      "mean_token_accuracy": 0.8404667204618455,
+      "num_tokens": 12061827.0,
+      "step": 825
+    },
+    {
+      "entropy": 0.6627422112226486,
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.9181033968925476,
+      "learning_rate": 8.131835609169295e-06,
+      "loss": 0.6494012451171876,
+      "mean_token_accuracy": 0.839583694934845,
+      "num_tokens": 12427853.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.6641036707162857,
+      "epoch": 1.5597147950089125,
+      "grad_norm": 0.858001172542572,
+      "learning_rate": 7.760423972779985e-06,
+      "loss": 0.6495742797851562,
+      "mean_token_accuracy": 0.8395592844486237,
+      "num_tokens": 12799973.0,
+      "step": 875
+    },
+    {
+      "entropy": 0.6689085793495179,
+      "epoch": 1.6042780748663101,
+      "grad_norm": 0.8615349531173706,
+      "learning_rate": 7.388371407567565e-06,
+      "loss": 0.6532559967041016,
+      "mean_token_accuracy": 0.8388407498598098,
+      "num_tokens": 13166796.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.6729245400428772,
+      "epoch": 1.6488413547237077,
+      "grad_norm": 0.831142783164978,
+      "learning_rate": 7.01659357121981e-06,
+      "loss": 0.6572090911865235,
+      "mean_token_accuracy": 0.8372052818536758,
+      "num_tokens": 13532499.0,
+      "step": 925
+    },
+    {
+      "entropy": 0.6538485777378082,
+      "epoch": 1.6934046345811051,
+      "grad_norm": 0.919346809387207,
+      "learning_rate": 6.6460054452899315e-06,
+      "loss": 0.6404708862304688,
+      "mean_token_accuracy": 0.8411308795213699,
+      "num_tokens": 13898404.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.6691750481724739,
+      "epoch": 1.7379679144385025,
+      "grad_norm": 0.9280221462249756,
+      "learning_rate": 6.277519083337656e-06,
+      "loss": 0.6546466827392579,
+      "mean_token_accuracy": 0.838825848698616,
+      "num_tokens": 14261658.0,
+      "step": 975
+    },
+    {
+      "entropy": 0.6536609560251236,
+      "epoch": 1.7825311942959001,
+      "grad_norm": 0.9000495076179504,
+      "learning_rate": 5.9120413662763545e-06,
+      "loss": 0.6405950927734375,
+      "mean_token_accuracy": 0.8412596487998962,
+      "num_tokens": 14625008.0,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7825311942959001,
+      "eval_entropy": 0.6716028243303299,
+      "eval_loss": 0.6561057567596436,
+      "eval_mean_token_accuracy": 0.8381222817897797,
+      "eval_num_tokens": 14625008.0,
+      "eval_runtime": 74.7617,
+      "eval_samples_per_second": 13.336,
+      "eval_steps_per_second": 3.344,
+      "step": 1000
+    },
+    {
+      "entropy": 0.6671841683983802,
+      "epoch": 1.8270944741532977,
+      "grad_norm": 0.8711400628089905,
+      "learning_rate": 5.550471770450572e-06,
+      "loss": 0.6500684356689453,
+      "mean_token_accuracy": 0.8389109486341476,
+      "num_tokens": 14985559.0,
+      "step": 1025
+    },
+    {
+      "entropy": 0.6568678751587868,
+      "epoch": 1.8716577540106951,
+      "grad_norm": 0.9135516285896301,
+      "learning_rate": 5.193700153936934e-06,
+      "loss": 0.6418634033203126,
+      "mean_token_accuracy": 0.8414819967746735,
+      "num_tokens": 15354311.0,
+      "step": 1050
+    },
+    {
+      "entropy": 0.6430006143450737,
+      "epoch": 1.9162210338680927,
+      "grad_norm": 0.9346958994865417,
+      "learning_rate": 4.842604566516537e-06,
+      "loss": 0.6278348541259766,
+      "mean_token_accuracy": 0.8434987276792526,
+      "num_tokens": 15721382.0,
+      "step": 1075
+    },
+    {
+      "entropy": 0.6387567144632339,
+      "epoch": 1.9607843137254903,
+      "grad_norm": 0.9693854451179504,
+      "learning_rate": 4.498049088708706e-06,
+      "loss": 0.6229427337646485,
+      "mean_token_accuracy": 0.8442350590229034,
+      "num_tokens": 16088038.0,
+      "step": 1100
+    },
+    {
+      "entropy": 0.6434592244029045,
+      "epoch": 2.0053475935828877,
+      "grad_norm": 0.9158383011817932,
+      "learning_rate": 4.160881705184478e-06,
+      "loss": 0.6287346649169921,
+      "mean_token_accuracy": 0.8434397971630097,
+      "num_tokens": 16448228.0,
+      "step": 1125
+    },
+    {
+      "entropy": 0.6293540370464324,
+      "epoch": 2.049910873440285,
+      "grad_norm": 0.9278510808944702,
+      "learning_rate": 3.831932217793526e-06,
+      "loss": 0.6089762115478515,
+      "mean_token_accuracy": 0.8473779886960984,
+      "num_tokens": 16812866.0,
+      "step": 1150
+    },
+    {
+      "entropy": 0.6246551343798638,
+      "epoch": 2.0944741532976825,
+      "grad_norm": 0.8729245066642761,
+      "learning_rate": 3.5120102033408053e-06,
+      "loss": 0.6066710281372071,
+      "mean_token_accuracy": 0.8471958756446838,
+      "num_tokens": 17177909.0,
+      "step": 1175
+    },
+    {
+      "entropy": 0.6269071605801583,
+      "epoch": 2.1390374331550803,
+      "grad_norm": 0.8709802031517029,
+      "learning_rate": 3.201903021138983e-06,
+      "loss": 0.6111587905883789,
+      "mean_token_accuracy": 0.8464664667844772,
+      "num_tokens": 17544377.0,
+      "step": 1200
+    },
+    {
+      "epoch": 2.1390374331550803,
+      "eval_entropy": 0.6344557646512985,
+      "eval_loss": 0.6462315320968628,
+      "eval_mean_token_accuracy": 0.8403205525875092,
+      "eval_num_tokens": 17544377.0,
+      "eval_runtime": 74.8344,
+      "eval_samples_per_second": 13.323,
+      "eval_steps_per_second": 3.341,
+      "step": 1200
+    },
+    {
+      "entropy": 0.617467094361782,
+      "epoch": 2.1836007130124777,
+      "grad_norm": 0.8771170973777771,
+      "learning_rate": 2.9023738752403013e-06,
+      "loss": 0.5986224746704102,
+      "mean_token_accuracy": 0.849560460448265,
+      "num_tokens": 17912855.0,
+      "step": 1225
+    },
+    {
+      "entropy": 0.6177873882651329,
+      "epoch": 2.228163992869875,
+      "grad_norm": 1.0253841876983643,
+      "learning_rate": 2.614159936116893e-06,
+      "loss": 0.5998103332519531,
+      "mean_token_accuracy": 0.8487882578372955,
+      "num_tokens": 18279476.0,
+      "step": 1250
+    },
+    {
+      "entropy": 0.6312283331155777,
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.9465038180351257,
+      "learning_rate": 2.337970526412267e-06,
+      "loss": 0.6118741226196289,
+      "mean_token_accuracy": 0.8458875006437302,
+      "num_tokens": 18644269.0,
+      "step": 1275
+    },
+    {
+      "entropy": 0.6209010258316994,
+      "epoch": 2.3172905525846703,
+      "grad_norm": 0.9807332158088684,
+      "learning_rate": 2.074485375229037e-06,
+      "loss": 0.6052029037475586,
+      "mean_token_accuracy": 0.8471564346551895,
+      "num_tokens": 19009107.0,
+      "step": 1300
+    },
+    {
+      "entropy": 0.6401337105035781,
+      "epoch": 2.3618538324420677,
+      "grad_norm": 1.0486506223678589,
+      "learning_rate": 1.82435294524924e-06,
+      "loss": 0.6207434463500977,
+      "mean_token_accuracy": 0.8439285135269166,
+      "num_tokens": 19374349.0,
+      "step": 1325
+    },
+    {
+      "entropy": 0.6109014016389847,
+      "epoch": 2.406417112299465,
+      "grad_norm": 0.9694714546203613,
+      "learning_rate": 1.5881888368043559e-06,
+      "loss": 0.5924215316772461,
+      "mean_token_accuracy": 0.8494464015960693,
+      "num_tokens": 19743047.0,
+      "step": 1350
+    },
+    {
+      "entropy": 0.6300237196683883,
+      "epoch": 2.450980392156863,
+      "grad_norm": 0.9961308836936951,
+      "learning_rate": 1.3665742728227932e-06,
+      "loss": 0.6133406066894531,
+      "mean_token_accuracy": 0.8462675029039383,
+      "num_tokens": 20105853.0,
+      "step": 1375
+    },
+    {
+      "entropy": 0.6148158556222916,
+      "epoch": 2.4955436720142603,
+      "grad_norm": 1.0224037170410156,
+      "learning_rate": 1.1600546683835065e-06,
+      "loss": 0.5978146362304687,
+      "mean_token_accuracy": 0.8488863033056259,
+      "num_tokens": 20469876.0,
+      "step": 1400
+    },
+    {
+      "epoch": 2.4955436720142603,
+      "eval_entropy": 0.627735008597374,
+      "eval_loss": 0.6408645510673523,
+      "eval_mean_token_accuracy": 0.8411034562587738,
+      "eval_num_tokens": 20469876.0,
+      "eval_runtime": 74.7955,
+      "eval_samples_per_second": 13.33,
+      "eval_steps_per_second": 3.342,
+      "step": 1400
+    },
+    {
+      "entropy": 0.6221208718419075,
+      "epoch": 2.5401069518716577,
+      "grad_norm": 1.0483691692352295,
+      "learning_rate": 9.691382883962515e-07,
+      "loss": 0.6043234634399414,
+      "mean_token_accuracy": 0.8475923782587051,
+      "num_tokens": 20834908.0,
+      "step": 1425
+    },
+    {
+      "entropy": 0.6163172733783722,
+      "epoch": 2.5846702317290555,
+      "grad_norm": 1.0169743299484253,
+      "learning_rate": 7.942949967120098e-07,
+      "loss": 0.6007443237304687,
+      "mean_token_accuracy": 0.8487154805660247,
+      "num_tokens": 21199575.0,
+      "step": 1450
+    },
+    {
+      "entropy": 0.6306376928091049,
+      "epoch": 2.629233511586453,
+      "grad_norm": 0.9749926328659058,
+      "learning_rate": 6.359550997421698e-07,
+      "loss": 0.6101107406616211,
+      "mean_token_accuracy": 0.8469714081287384,
+      "num_tokens": 21564414.0,
+      "step": 1475
+    },
+    {
+      "entropy": 0.6135326558351517,
+      "epoch": 2.6737967914438503,
+      "grad_norm": 1.0116835832595825,
+      "learning_rate": 4.945082874324541e-07,
+      "loss": 0.5956003189086914,
+      "mean_token_accuracy": 0.8500852519273758,
+      "num_tokens": 21928080.0,
+      "step": 1500
+    },
+    {
+      "entropy": 0.6165187922120094,
+      "epoch": 2.7183600713012477,
+      "grad_norm": 0.9928510785102844,
+      "learning_rate": 3.7030267419789764e-07,
+      "loss": 0.6013864135742187,
+      "mean_token_accuracy": 0.8494158619642258,
+      "num_tokens": 22296207.0,
+      "step": 1525
+    },
+    {
+      "entropy": 0.619775217473507,
+      "epoch": 2.762923351158645,
+      "grad_norm": 0.9901552796363831,
+      "learning_rate": 2.6364394217929856e-07,
+      "loss": 0.6034153366088867,
+      "mean_token_accuracy": 0.8480428576469421,
+      "num_tokens": 22661645.0,
+      "step": 1550
+    },
+    {
+      "entropy": 0.6086369237303734,
+      "epoch": 2.807486631016043,
+      "grad_norm": 0.8772838711738586,
+      "learning_rate": 1.7479458892961846e-07,
+      "loss": 0.5885520553588868,
+      "mean_token_accuracy": 0.8515380412340164,
+      "num_tokens": 23028234.0,
+      "step": 1575
+    },
+    {
+      "entropy": 0.6151553666591645,
+      "epoch": 2.8520499108734403,
+      "grad_norm": 0.9948622584342957,
+      "learning_rate": 1.0397328138187557e-07,
+      "loss": 0.5963270568847656,
+      "mean_token_accuracy": 0.8506602907180786,
+      "num_tokens": 23393799.0,
+      "step": 1600
+    },
+    {
+      "epoch": 2.8520499108734403,
+      "eval_entropy": 0.626643338561058,
+      "eval_loss": 0.6390902400016785,
+      "eval_mean_token_accuracy": 0.8414715526103973,
+      "eval_num_tokens": 23393799.0,
+      "eval_runtime": 74.7969,
+      "eval_samples_per_second": 13.329,
+      "eval_steps_per_second": 3.342,
+      "step": 1600
+    },
+    {
+      "entropy": 0.6176378938555718,
+      "epoch": 2.8966131907308377,
+      "grad_norm": 0.9002705216407776,
+      "learning_rate": 5.135431768847676e-08,
+      "loss": 0.5993848037719727,
+      "mean_token_accuracy": 0.8487631809711457,
+      "num_tokens": 23759121.0,
+      "step": 1625
+    },
+    {
+      "entropy": 0.6263965710997581,
+      "epoch": 2.9411764705882355,
+      "grad_norm": 0.9609607458114624,
+      "learning_rate": 1.7067198256442428e-08,
+      "loss": 0.6067921829223633,
+      "mean_token_accuracy": 0.8475446420907974,
+      "num_tokens": 24123974.0,
+      "step": 1650
+    },
+    {
+      "entropy": 0.6194088864326477,
+      "epoch": 2.985739750445633,
+      "grad_norm": 0.9630091190338135,
+      "learning_rate": 1.1963070342654869e-09,
+      "loss": 0.602127571105957,
+      "mean_token_accuracy": 0.8485302919149399,
+      "num_tokens": 24488475.0,
+      "step": 1675
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 1683,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1863540916795904e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1683/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
+size 5585

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
+size 11421892

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33ef6c419df9522b8fc114339f82c3e390d52dabd314191546cdbca2a738c0af
+size 5585

training_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "model": "Qwen/Qwen2.5-7B-Instruct",
+  "lora_rank": 64,
+  "lora_alpha": 128,
+  "epochs": 3,
+  "batch": 16,
+  "lr": 1.5e-05,
+  "train_size": 8973,
+  "val_size": 997,
+  "quant": "none (bf16)"
+}