agu18dec committed 26 days ago

Commit

d0d7f35

verified ·

1 Parent(s): 4687d28

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/README.md +61 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/trainer_state.json +1114 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/trainer_state.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/trainer_state.json +2194 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/tokenizer.json +3 -0

.gitattributes CHANGED Viewed

@@ -578,3 +578,14 @@ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/chec
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/checkpoint-8368/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/checkpoint-9414/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/checkpoint-8368/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/checkpoint-9414/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a2_B1_L20_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-4324/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-5405/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-6486/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-7567/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-8648/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-9729/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/qwdcxorf)
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.19.1
+- TRL: 0.28.0
+- Transformers: 4.57.6
+- Pytorch: 2.9.1
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d024b67dfcd2b3be75af3799cf96f3b78068a30f8e4ebd4c5a1ce83bf2539173
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42f9e7287e13eba59c98489767ff96fc23b3ba999ce570f511a42337333223fd
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1114 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1081,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2675078868865968,
+      "epoch": 0.009250693802035153,
+      "grad_norm": 5.175303936004639,
+      "learning_rate": 1.6635859519408503e-06,
+      "loss": 1.3297,
+      "mean_token_accuracy": 0.6369022190570831,
+      "num_tokens": 13606.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2817180395126342,
+      "epoch": 0.018501387604070305,
+      "grad_norm": 4.235851287841797,
+      "learning_rate": 3.512014787430684e-06,
+      "loss": 1.3307,
+      "mean_token_accuracy": 0.6350878715515137,
+      "num_tokens": 27217.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.262493872642517,
+      "epoch": 0.027752081406105456,
+      "grad_norm": 5.058028221130371,
+      "learning_rate": 5.360443622920518e-06,
+      "loss": 1.2136,
+      "mean_token_accuracy": 0.6545175433158874,
+      "num_tokens": 40014.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2985574364662171,
+      "epoch": 0.03700277520814061,
+      "grad_norm": 3.649226665496826,
+      "learning_rate": 7.2088724584103514e-06,
+      "loss": 1.266,
+      "mean_token_accuracy": 0.6147747755050659,
+      "num_tokens": 53225.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.314147698879242,
+      "epoch": 0.04625346901017576,
+      "grad_norm": 1.8768671751022339,
+      "learning_rate": 9.057301293900185e-06,
+      "loss": 1.0651,
+      "mean_token_accuracy": 0.6504510939121246,
+      "num_tokens": 66702.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.3419164538383483,
+      "epoch": 0.05550416281221091,
+      "grad_norm": 1.107013463973999,
+      "learning_rate": 1.0905730129390019e-05,
+      "loss": 0.9847,
+      "mean_token_accuracy": 0.6611938953399659,
+      "num_tokens": 79234.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.3964227795600892,
+      "epoch": 0.06475485661424607,
+      "grad_norm": 1.1377882957458496,
+      "learning_rate": 1.2754158964879853e-05,
+      "loss": 1.0259,
+      "mean_token_accuracy": 0.6437336921691894,
+      "num_tokens": 91832.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.4071043848991394,
+      "epoch": 0.07400555041628122,
+      "grad_norm": 1.1729836463928223,
+      "learning_rate": 1.4602587800369685e-05,
+      "loss": 0.9942,
+      "mean_token_accuracy": 0.6535955727100372,
+      "num_tokens": 104786.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.3836156249046325,
+      "epoch": 0.08325624421831637,
+      "grad_norm": 1.2727314233779907,
+      "learning_rate": 1.645101663585952e-05,
+      "loss": 0.9512,
+      "mean_token_accuracy": 0.6672091960906983,
+      "num_tokens": 117713.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.3533575654029846,
+      "epoch": 0.09250693802035152,
+      "grad_norm": 1.0665016174316406,
+      "learning_rate": 1.8299445471349355e-05,
+      "loss": 0.9175,
+      "mean_token_accuracy": 0.6751855850219727,
+      "num_tokens": 130661.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.3368927597999574,
+      "epoch": 0.10175763182238667,
+      "grad_norm": 0.7565245032310486,
+      "learning_rate": 2.0147874306839186e-05,
+      "loss": 0.8875,
+      "mean_token_accuracy": 0.6832059919834137,
+      "num_tokens": 144367.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.3387869954109193,
+      "epoch": 0.11100832562442182,
+      "grad_norm": 0.8816360831260681,
+      "learning_rate": 2.1996303142329023e-05,
+      "loss": 0.8901,
+      "mean_token_accuracy": 0.6898956418037414,
+      "num_tokens": 158098.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.3420647621154784,
+      "epoch": 0.12025901942645699,
+      "grad_norm": 0.9925475716590881,
+      "learning_rate": 2.3844731977818853e-05,
+      "loss": 0.922,
+      "mean_token_accuracy": 0.674148154258728,
+      "num_tokens": 171386.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.3588476538658143,
+      "epoch": 0.12950971322849214,
+      "grad_norm": 1.2224054336547852,
+      "learning_rate": 2.5693160813308687e-05,
+      "loss": 0.9012,
+      "mean_token_accuracy": 0.6888944447040558,
+      "num_tokens": 184011.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.3528849005699157,
+      "epoch": 0.13876040703052728,
+      "grad_norm": 0.8460184931755066,
+      "learning_rate": 2.754158964879852e-05,
+      "loss": 0.9482,
+      "mean_token_accuracy": 0.6652046918869019,
+      "num_tokens": 197201.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.3713510990142823,
+      "epoch": 0.14801110083256244,
+      "grad_norm": 1.1473662853240967,
+      "learning_rate": 2.9390018484288358e-05,
+      "loss": 0.9201,
+      "mean_token_accuracy": 0.6809458076953888,
+      "num_tokens": 209879.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.4071080327033996,
+      "epoch": 0.1572617946345976,
+      "grad_norm": 0.9973745942115784,
+      "learning_rate": 3.123844731977819e-05,
+      "loss": 0.98,
+      "mean_token_accuracy": 0.6592511236667633,
+      "num_tokens": 223079.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.3867051601409912,
+      "epoch": 0.16651248843663274,
+      "grad_norm": 1.2232855558395386,
+      "learning_rate": 3.308687615526803e-05,
+      "loss": 0.9706,
+      "mean_token_accuracy": 0.6540988445281982,
+      "num_tokens": 236579.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.3836822986602784,
+      "epoch": 0.1757631822386679,
+      "grad_norm": 1.4032989740371704,
+      "learning_rate": 3.493530499075786e-05,
+      "loss": 0.9215,
+      "mean_token_accuracy": 0.6792983472347259,
+      "num_tokens": 249412.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.3905784249305726,
+      "epoch": 0.18501387604070305,
+      "grad_norm": 1.32985520362854,
+      "learning_rate": 3.678373382624769e-05,
+      "loss": 0.9527,
+      "mean_token_accuracy": 0.6650208413600922,
+      "num_tokens": 262063.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.37818124294281,
+      "epoch": 0.1942645698427382,
+      "grad_norm": 1.0119655132293701,
+      "learning_rate": 3.8632162661737526e-05,
+      "loss": 0.914,
+      "mean_token_accuracy": 0.683707457780838,
+      "num_tokens": 275794.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.4193374395370484,
+      "epoch": 0.20351526364477335,
+      "grad_norm": 0.9579392671585083,
+      "learning_rate": 4.048059149722736e-05,
+      "loss": 0.9657,
+      "mean_token_accuracy": 0.6614095211029053,
+      "num_tokens": 289044.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.3844568610191346,
+      "epoch": 0.2127659574468085,
+      "grad_norm": 1.020785927772522,
+      "learning_rate": 4.232902033271719e-05,
+      "loss": 0.8909,
+      "mean_token_accuracy": 0.6828382730484008,
+      "num_tokens": 302504.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.3760874032974244,
+      "epoch": 0.22201665124884365,
+      "grad_norm": 0.9178677201271057,
+      "learning_rate": 4.4177449168207024e-05,
+      "loss": 0.7948,
+      "mean_token_accuracy": 0.7197233438491821,
+      "num_tokens": 315957.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.3967938780784608,
+      "epoch": 0.23126734505087881,
+      "grad_norm": 1.388061285018921,
+      "learning_rate": 4.602587800369686e-05,
+      "loss": 0.9133,
+      "mean_token_accuracy": 0.6771849095821381,
+      "num_tokens": 328806.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.4414387702941895,
+      "epoch": 0.24051803885291398,
+      "grad_norm": 0.9865493774414062,
+      "learning_rate": 4.787430683918669e-05,
+      "loss": 0.9665,
+      "mean_token_accuracy": 0.6627395570278167,
+      "num_tokens": 342048.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.4030232906341553,
+      "epoch": 0.24976873265494912,
+      "grad_norm": 1.1840286254882812,
+      "learning_rate": 4.972273567467653e-05,
+      "loss": 0.9092,
+      "mean_token_accuracy": 0.6842468678951263,
+      "num_tokens": 354934.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.4368361949920654,
+      "epoch": 0.2590194264569843,
+      "grad_norm": 1.1045658588409424,
+      "learning_rate": 5.157116451016636e-05,
+      "loss": 0.9749,
+      "mean_token_accuracy": 0.6620967447757721,
+      "num_tokens": 367651.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.4251260161399841,
+      "epoch": 0.2682701202590194,
+      "grad_norm": 0.9880494475364685,
+      "learning_rate": 5.3419593345656196e-05,
+      "loss": 0.9046,
+      "mean_token_accuracy": 0.6790169060230256,
+      "num_tokens": 381015.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.4615792393684388,
+      "epoch": 0.27752081406105455,
+      "grad_norm": 1.1569000482559204,
+      "learning_rate": 5.5268022181146026e-05,
+      "loss": 1.0025,
+      "mean_token_accuracy": 0.644273191690445,
+      "num_tokens": 393696.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.4423722743988037,
+      "epoch": 0.28677150786308975,
+      "grad_norm": 1.2215638160705566,
+      "learning_rate": 5.711645101663586e-05,
+      "loss": 0.9301,
+      "mean_token_accuracy": 0.6762896001338958,
+      "num_tokens": 407205.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.431472909450531,
+      "epoch": 0.2960222016651249,
+      "grad_norm": 0.7539701461791992,
+      "learning_rate": 5.89648798521257e-05,
+      "loss": 0.9627,
+      "mean_token_accuracy": 0.6622189521789551,
+      "num_tokens": 420423.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.4409304022789002,
+      "epoch": 0.30527289546716,
+      "grad_norm": 1.0752846002578735,
+      "learning_rate": 6.081330868761553e-05,
+      "loss": 0.9608,
+      "mean_token_accuracy": 0.6622848808765411,
+      "num_tokens": 433792.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.4193753361701966,
+      "epoch": 0.3145235892691952,
+      "grad_norm": 1.0436838865280151,
+      "learning_rate": 6.266173752310537e-05,
+      "loss": 0.9149,
+      "mean_token_accuracy": 0.678329062461853,
+      "num_tokens": 447028.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.474712109565735,
+      "epoch": 0.32377428307123035,
+      "grad_norm": 0.8787930011749268,
+      "learning_rate": 6.45101663585952e-05,
+      "loss": 1.0224,
+      "mean_token_accuracy": 0.639413845539093,
+      "num_tokens": 460030.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.446873426437378,
+      "epoch": 0.3330249768732655,
+      "grad_norm": 1.028443455696106,
+      "learning_rate": 6.635859519408502e-05,
+      "loss": 0.9809,
+      "mean_token_accuracy": 0.652896037697792,
+      "num_tokens": 473511.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.4507364869117736,
+      "epoch": 0.3422756706753006,
+      "grad_norm": 0.9386515617370605,
+      "learning_rate": 6.820702402957486e-05,
+      "loss": 0.9594,
+      "mean_token_accuracy": 0.6635801315307617,
+      "num_tokens": 487056.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.439187967777252,
+      "epoch": 0.3515263644773358,
+      "grad_norm": 1.075317144393921,
+      "learning_rate": 7.00554528650647e-05,
+      "loss": 0.9509,
+      "mean_token_accuracy": 0.6723007440567017,
+      "num_tokens": 500910.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.4200530290603637,
+      "epoch": 0.36077705827937095,
+      "grad_norm": 0.821780800819397,
+      "learning_rate": 7.190388170055453e-05,
+      "loss": 0.8318,
+      "mean_token_accuracy": 0.7095312774181366,
+      "num_tokens": 513491.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.4265631794929505,
+      "epoch": 0.3700277520814061,
+      "grad_norm": 1.0542839765548706,
+      "learning_rate": 7.375231053604437e-05,
+      "loss": 0.906,
+      "mean_token_accuracy": 0.6781497061252594,
+      "num_tokens": 526457.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.4734636306762696,
+      "epoch": 0.3792784458834413,
+      "grad_norm": 0.8997613787651062,
+      "learning_rate": 7.56007393715342e-05,
+      "loss": 0.9399,
+      "mean_token_accuracy": 0.6672972977161408,
+      "num_tokens": 539654.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.4279640436172485,
+      "epoch": 0.3885291396854764,
+      "grad_norm": 1.0399622917175293,
+      "learning_rate": 7.744916820702403e-05,
+      "loss": 0.8637,
+      "mean_token_accuracy": 0.6964247941970825,
+      "num_tokens": 552489.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.4684167385101319,
+      "epoch": 0.39777983348751156,
+      "grad_norm": 0.8020175695419312,
+      "learning_rate": 7.929759704251387e-05,
+      "loss": 0.9586,
+      "mean_token_accuracy": 0.6565007865428925,
+      "num_tokens": 565491.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.474439013004303,
+      "epoch": 0.4070305272895467,
+      "grad_norm": 1.190341591835022,
+      "learning_rate": 8.114602587800369e-05,
+      "loss": 0.9494,
+      "mean_token_accuracy": 0.658382123708725,
+      "num_tokens": 579034.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.5036101937294006,
+      "epoch": 0.4162812210915819,
+      "grad_norm": 0.981868326663971,
+      "learning_rate": 8.299445471349353e-05,
+      "loss": 1.0627,
+      "mean_token_accuracy": 0.6313726782798768,
+      "num_tokens": 591893.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.4632315397262574,
+      "epoch": 0.425531914893617,
+      "grad_norm": 1.076858639717102,
+      "learning_rate": 8.484288354898337e-05,
+      "loss": 0.9065,
+      "mean_token_accuracy": 0.6824360251426697,
+      "num_tokens": 604649.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.4752267003059387,
+      "epoch": 0.43478260869565216,
+      "grad_norm": 0.8257336020469666,
+      "learning_rate": 8.66913123844732e-05,
+      "loss": 0.9493,
+      "mean_token_accuracy": 0.6698306739330292,
+      "num_tokens": 617820.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.4853222370147705,
+      "epoch": 0.4440333024976873,
+      "grad_norm": 0.8594188094139099,
+      "learning_rate": 8.853974121996304e-05,
+      "loss": 0.9786,
+      "mean_token_accuracy": 0.6549507141113281,
+      "num_tokens": 630699.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.464365589618683,
+      "epoch": 0.4532839962997225,
+      "grad_norm": 1.0990275144577026,
+      "learning_rate": 9.038817005545288e-05,
+      "loss": 0.8889,
+      "mean_token_accuracy": 0.6922601819038391,
+      "num_tokens": 644127.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.489177119731903,
+      "epoch": 0.46253469010175763,
+      "grad_norm": 0.9509454369544983,
+      "learning_rate": 9.22365988909427e-05,
+      "loss": 0.9426,
+      "mean_token_accuracy": 0.6659547984600067,
+      "num_tokens": 657148.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.4962970852851867,
+      "epoch": 0.47178538390379277,
+      "grad_norm": 1.2484740018844604,
+      "learning_rate": 9.408502772643254e-05,
+      "loss": 0.953,
+      "mean_token_accuracy": 0.6586906611919403,
+      "num_tokens": 670505.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.4610695004463197,
+      "epoch": 0.48103607770582796,
+      "grad_norm": 1.038205862045288,
+      "learning_rate": 9.593345656192237e-05,
+      "loss": 0.8792,
+      "mean_token_accuracy": 0.695543521642685,
+      "num_tokens": 683406.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.543120539188385,
+      "epoch": 0.4902867715078631,
+      "grad_norm": 0.6887747645378113,
+      "learning_rate": 9.778188539741221e-05,
+      "loss": 1.0731,
+      "mean_token_accuracy": 0.6240392029285431,
+      "num_tokens": 695965.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.5109227538108825,
+      "epoch": 0.49953746530989823,
+      "grad_norm": 1.0471116304397583,
+      "learning_rate": 9.963031423290203e-05,
+      "loss": 0.8617,
+      "mean_token_accuracy": 0.6985665023326874,
+      "num_tokens": 709172.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.4863034248352052,
+      "epoch": 0.5087881591119334,
+      "grad_norm": 0.9132720828056335,
+      "learning_rate": 9.999985025125081e-05,
+      "loss": 0.8525,
+      "mean_token_accuracy": 0.7047226369380951,
+      "num_tokens": 722271.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.517179274559021,
+      "epoch": 0.5180388529139686,
+      "grad_norm": 0.8982908129692078,
+      "learning_rate": 9.999924189849452e-05,
+      "loss": 0.9545,
+      "mean_token_accuracy": 0.6619311034679413,
+      "num_tokens": 735490.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.5107035517692566,
+      "epoch": 0.5272895467160037,
+      "grad_norm": 1.1028233766555786,
+      "learning_rate": 9.99981655881237e-05,
+      "loss": 1.008,
+      "mean_token_accuracy": 0.6386784017086029,
+      "num_tokens": 748441.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.4935171365737916,
+      "epoch": 0.5365402405180388,
+      "grad_norm": 1.5201911926269531,
+      "learning_rate": 9.999662133021185e-05,
+      "loss": 0.9209,
+      "mean_token_accuracy": 0.6722200810909271,
+      "num_tokens": 761485.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.5198947191238403,
+      "epoch": 0.545790934320074,
+      "grad_norm": 1.154875636100769,
+      "learning_rate": 9.999460913921211e-05,
+      "loss": 0.9829,
+      "mean_token_accuracy": 0.6530335664749145,
+      "num_tokens": 775045.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.5305540323257447,
+      "epoch": 0.5550416281221091,
+      "grad_norm": 0.9236158132553101,
+      "learning_rate": 9.99921290339572e-05,
+      "loss": 1.0093,
+      "mean_token_accuracy": 0.6421272337436676,
+      "num_tokens": 787827.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.5291874051094054,
+      "epoch": 0.5642923219241444,
+      "grad_norm": 1.0652803182601929,
+      "learning_rate": 9.998918103765914e-05,
+      "loss": 0.927,
+      "mean_token_accuracy": 0.6654485881328582,
+      "num_tokens": 801418.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.550955057144165,
+      "epoch": 0.5735430157261795,
+      "grad_norm": 0.9596770405769348,
+      "learning_rate": 9.99857651779091e-05,
+      "loss": 1.0393,
+      "mean_token_accuracy": 0.6354499399662018,
+      "num_tokens": 814457.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.501573884487152,
+      "epoch": 0.5827937095282146,
+      "grad_norm": 0.9417737722396851,
+      "learning_rate": 9.998188148667708e-05,
+      "loss": 0.8915,
+      "mean_token_accuracy": 0.6816763877868652,
+      "num_tokens": 827714.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.4670620203018188,
+      "epoch": 0.5920444033302498,
+      "grad_norm": 1.3990159034729004,
+      "learning_rate": 9.997753000031175e-05,
+      "loss": 0.8596,
+      "mean_token_accuracy": 0.6988556385040283,
+      "num_tokens": 840400.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.528640353679657,
+      "epoch": 0.6012950971322849,
+      "grad_norm": 1.0697306394577026,
+      "learning_rate": 9.997271075953994e-05,
+      "loss": 0.9713,
+      "mean_token_accuracy": 0.6468679904937744,
+      "num_tokens": 853028.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.5169320225715637,
+      "epoch": 0.61054579093432,
+      "grad_norm": 1.1989524364471436,
+      "learning_rate": 9.996742380946628e-05,
+      "loss": 0.9643,
+      "mean_token_accuracy": 0.6630256116390228,
+      "num_tokens": 866211.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.486118495464325,
+      "epoch": 0.6197964847363552,
+      "grad_norm": 1.3864704370498657,
+      "learning_rate": 9.996166919957297e-05,
+      "loss": 0.9255,
+      "mean_token_accuracy": 0.6708253383636474,
+      "num_tokens": 879248.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.5209161520004273,
+      "epoch": 0.6290471785383904,
+      "grad_norm": 0.8406121134757996,
+      "learning_rate": 9.995544698371904e-05,
+      "loss": 0.9067,
+      "mean_token_accuracy": 0.6848369777202606,
+      "num_tokens": 892532.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.5373739361763001,
+      "epoch": 0.6382978723404256,
+      "grad_norm": 1.0999748706817627,
+      "learning_rate": 9.994875722014008e-05,
+      "loss": 1.014,
+      "mean_token_accuracy": 0.6408933758735657,
+      "num_tokens": 905340.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.516480839252472,
+      "epoch": 0.6475485661424607,
+      "grad_norm": 0.9417734742164612,
+      "learning_rate": 9.994159997144752e-05,
+      "loss": 0.9749,
+      "mean_token_accuracy": 0.6582231283187866,
+      "num_tokens": 918565.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.5324442982673645,
+      "epoch": 0.6567992599444958,
+      "grad_norm": 0.9670525193214417,
+      "learning_rate": 9.993397530462818e-05,
+      "loss": 0.9952,
+      "mean_token_accuracy": 0.6441864192485809,
+      "num_tokens": 931590.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.5277332067489624,
+      "epoch": 0.666049953746531,
+      "grad_norm": 1.1460349559783936,
+      "learning_rate": 9.992588329104354e-05,
+      "loss": 0.9238,
+      "mean_token_accuracy": 0.677095913887024,
+      "num_tokens": 945000.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.5624989986419677,
+      "epoch": 0.6753006475485661,
+      "grad_norm": 0.9031643867492676,
+      "learning_rate": 9.991732400642916e-05,
+      "loss": 0.9745,
+      "mean_token_accuracy": 0.6575322329998017,
+      "num_tokens": 958208.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.5397494316101075,
+      "epoch": 0.6845513413506013,
+      "grad_norm": 0.9962924122810364,
+      "learning_rate": 9.990829753089389e-05,
+      "loss": 0.9186,
+      "mean_token_accuracy": 0.6840661704540253,
+      "num_tokens": 971666.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.5536072373390197,
+      "epoch": 0.6938020351526365,
+      "grad_norm": 0.9214943051338196,
+      "learning_rate": 9.989880394891917e-05,
+      "loss": 0.9868,
+      "mean_token_accuracy": 0.656391030550003,
+      "num_tokens": 984577.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.559676969051361,
+      "epoch": 0.7030527289546716,
+      "grad_norm": 0.9955084919929504,
+      "learning_rate": 9.988884334935823e-05,
+      "loss": 0.9135,
+      "mean_token_accuracy": 0.6811449110507966,
+      "num_tokens": 997960.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.5445342898368835,
+      "epoch": 0.7123034227567068,
+      "grad_norm": 0.9573326706886292,
+      "learning_rate": 9.987841582543525e-05,
+      "loss": 0.9002,
+      "mean_token_accuracy": 0.6902318239212036,
+      "num_tokens": 1011462.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.542565393447876,
+      "epoch": 0.7215541165587419,
+      "grad_norm": 0.9867991805076599,
+      "learning_rate": 9.986752147474449e-05,
+      "loss": 0.9514,
+      "mean_token_accuracy": 0.6688324213027954,
+      "num_tokens": 1024674.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.5256419897079467,
+      "epoch": 0.730804810360777,
+      "grad_norm": 0.9705437421798706,
+      "learning_rate": 9.985616039924938e-05,
+      "loss": 0.8626,
+      "mean_token_accuracy": 0.6957047760486603,
+      "num_tokens": 1037752.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.548219668865204,
+      "epoch": 0.7400555041628122,
+      "grad_norm": 1.084227204322815,
+      "learning_rate": 9.984433270528158e-05,
+      "loss": 0.9711,
+      "mean_token_accuracy": 0.6522482097148895,
+      "num_tokens": 1050884.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.517548406124115,
+      "epoch": 0.7493061979648473,
+      "grad_norm": 1.3059539794921875,
+      "learning_rate": 9.983203850353996e-05,
+      "loss": 0.8743,
+      "mean_token_accuracy": 0.6982108950614929,
+      "num_tokens": 1064422.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.562002694606781,
+      "epoch": 0.7585568917668826,
+      "grad_norm": 1.038290023803711,
+      "learning_rate": 9.981927790908955e-05,
+      "loss": 0.9767,
+      "mean_token_accuracy": 0.6550276100635528,
+      "num_tokens": 1077313.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.5344805240631103,
+      "epoch": 0.7678075855689177,
+      "grad_norm": 1.287285327911377,
+      "learning_rate": 9.980605104136054e-05,
+      "loss": 0.9671,
+      "mean_token_accuracy": 0.6576497316360473,
+      "num_tokens": 1090303.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.5300293803215026,
+      "epoch": 0.7770582793709528,
+      "grad_norm": 0.9832729697227478,
+      "learning_rate": 9.979235802414704e-05,
+      "loss": 0.9237,
+      "mean_token_accuracy": 0.6844188570976257,
+      "num_tokens": 1103125.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.5637137055397035,
+      "epoch": 0.786308973172988,
+      "grad_norm": 1.0553216934204102,
+      "learning_rate": 9.977819898560605e-05,
+      "loss": 0.9595,
+      "mean_token_accuracy": 0.6634797155857086,
+      "num_tokens": 1116372.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.56386079788208,
+      "epoch": 0.7955596669750231,
+      "grad_norm": 1.0578479766845703,
+      "learning_rate": 9.976357405825617e-05,
+      "loss": 0.9763,
+      "mean_token_accuracy": 0.6628308475017548,
+      "num_tokens": 1129730.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.528944981098175,
+      "epoch": 0.8048103607770583,
+      "grad_norm": 1.1011334657669067,
+      "learning_rate": 9.97484833789764e-05,
+      "loss": 0.8523,
+      "mean_token_accuracy": 0.7008255600929261,
+      "num_tokens": 1143105.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.5662498712539672,
+      "epoch": 0.8140610545790934,
+      "grad_norm": 0.8881069421768188,
+      "learning_rate": 9.973292708900484e-05,
+      "loss": 0.9357,
+      "mean_token_accuracy": 0.6682270467281342,
+      "num_tokens": 1155545.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.5809241771697997,
+      "epoch": 0.8233117483811286,
+      "grad_norm": 1.1357725858688354,
+      "learning_rate": 9.971690533393741e-05,
+      "loss": 0.9769,
+      "mean_token_accuracy": 0.6525391936302185,
+      "num_tokens": 1169079.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.5619606137275697,
+      "epoch": 0.8325624421831638,
+      "grad_norm": 1.3776752948760986,
+      "learning_rate": 9.970041826372639e-05,
+      "loss": 0.9695,
+      "mean_token_accuracy": 0.6635968685150146,
+      "num_tokens": 1182074.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.5364328384399415,
+      "epoch": 0.8418131359851989,
+      "grad_norm": 1.1025310754776,
+      "learning_rate": 9.968346603267912e-05,
+      "loss": 0.9185,
+      "mean_token_accuracy": 0.6784902811050415,
+      "num_tokens": 1195155.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.5355506658554077,
+      "epoch": 0.851063829787234,
+      "grad_norm": 0.9506418108940125,
+      "learning_rate": 9.966604879945659e-05,
+      "loss": 0.9145,
+      "mean_token_accuracy": 0.6740457832813262,
+      "num_tokens": 1207971.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.5638957381248475,
+      "epoch": 0.8603145235892692,
+      "grad_norm": 1.1018052101135254,
+      "learning_rate": 9.964816672707172e-05,
+      "loss": 0.9725,
+      "mean_token_accuracy": 0.6551022946834564,
+      "num_tokens": 1221499.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.5352379202842712,
+      "epoch": 0.8695652173913043,
+      "grad_norm": 1.0132355690002441,
+      "learning_rate": 9.962981998288813e-05,
+      "loss": 0.8719,
+      "mean_token_accuracy": 0.6871542632579803,
+      "num_tokens": 1234784.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.5783620834350587,
+      "epoch": 0.8788159111933395,
+      "grad_norm": 1.024332880973816,
+      "learning_rate": 9.96110087386184e-05,
+      "loss": 0.9353,
+      "mean_token_accuracy": 0.6774326741695404,
+      "num_tokens": 1248222.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.540341329574585,
+      "epoch": 0.8880666049953746,
+      "grad_norm": 1.044959545135498,
+      "learning_rate": 9.959173317032247e-05,
+      "loss": 0.9502,
+      "mean_token_accuracy": 0.6629186689853668,
+      "num_tokens": 1261151.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.5592219948768615,
+      "epoch": 0.8973172987974098,
+      "grad_norm": 0.9212714433670044,
+      "learning_rate": 9.957199345840609e-05,
+      "loss": 0.9693,
+      "mean_token_accuracy": 0.6631014049053192,
+      "num_tokens": 1274605.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.543090546131134,
+      "epoch": 0.906567992599445,
+      "grad_norm": 1.0074114799499512,
+      "learning_rate": 9.955178978761901e-05,
+      "loss": 0.9454,
+      "mean_token_accuracy": 0.6679804205894471,
+      "num_tokens": 1287486.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.5354868412017821,
+      "epoch": 0.9158186864014801,
+      "grad_norm": 1.107582688331604,
+      "learning_rate": 9.953112234705333e-05,
+      "loss": 0.8522,
+      "mean_token_accuracy": 0.703345662355423,
+      "num_tokens": 1301186.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.5681379914283753,
+      "epoch": 0.9250693802035153,
+      "grad_norm": 0.9099576473236084,
+      "learning_rate": 9.950999133014171e-05,
+      "loss": 0.9867,
+      "mean_token_accuracy": 0.6551129937171936,
+      "num_tokens": 1314154.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.567443573474884,
+      "epoch": 0.9343200740055504,
+      "grad_norm": 1.4343048334121704,
+      "learning_rate": 9.948839693465558e-05,
+      "loss": 0.8902,
+      "mean_token_accuracy": 0.6830959975719452,
+      "num_tokens": 1327333.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.5958142399787902,
+      "epoch": 0.9435707678075855,
+      "grad_norm": 0.9880425333976746,
+      "learning_rate": 9.946633936270318e-05,
+      "loss": 1.0509,
+      "mean_token_accuracy": 0.6263476490974427,
+      "num_tokens": 1340958.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.5669716715812683,
+      "epoch": 0.9528214616096207,
+      "grad_norm": 1.0334872007369995,
+      "learning_rate": 9.944381882072786e-05,
+      "loss": 0.9664,
+      "mean_token_accuracy": 0.6644764959812164,
+      "num_tokens": 1353850.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.5883784770965577,
+      "epoch": 0.9620721554116559,
+      "grad_norm": 1.1019115447998047,
+      "learning_rate": 9.942083551950598e-05,
+      "loss": 0.9923,
+      "mean_token_accuracy": 0.6469030797481536,
+      "num_tokens": 1367556.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.5479753255844115,
+      "epoch": 0.971322849213691,
+      "grad_norm": 1.1472551822662354,
+      "learning_rate": 9.939738967414505e-05,
+      "loss": 0.9273,
+      "mean_token_accuracy": 0.6670134663581848,
+      "num_tokens": 1380637.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.5589645266532899,
+      "epoch": 0.9805735430157262,
+      "grad_norm": 1.028989315032959,
+      "learning_rate": 9.937348150408159e-05,
+      "loss": 0.9592,
+      "mean_token_accuracy": 0.6594673275947571,
+      "num_tokens": 1393569.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.5476471185684204,
+      "epoch": 0.9898242368177613,
+      "grad_norm": 1.1020350456237793,
+      "learning_rate": 9.934911123307921e-05,
+      "loss": 0.9043,
+      "mean_token_accuracy": 0.6805355906486511,
+      "num_tokens": 1406782.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.5664406776428224,
+      "epoch": 0.9990749306197965,
+      "grad_norm": 1.1525558233261108,
+      "learning_rate": 9.932427908922647e-05,
+      "loss": 0.9545,
+      "mean_token_accuracy": 0.6627410531044007,
+      "num_tokens": 1420398.0,
+      "step": 1080
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10810,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.044373117999821e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70d35cdda86487b34a22e099225d60cf4a9bcebf0faa180a692b90c154874ffa
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-1081/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d024b67dfcd2b3be75af3799cf96f3b78068a30f8e4ebd4c5a1ce83bf2539173
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70d35cdda86487b34a22e099225d60cf4a9bcebf0faa180a692b90c154874ffa
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-10810/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:575f4f225b0a06d84781058cdb8b48a68052b73e1f14952068983df72770039b
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2194 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 2162,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2675078868865968,
+      "epoch": 0.009250693802035153,
+      "grad_norm": 5.175303936004639,
+      "learning_rate": 1.6635859519408503e-06,
+      "loss": 1.3297,
+      "mean_token_accuracy": 0.6369022190570831,
+      "num_tokens": 13606.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2817180395126342,
+      "epoch": 0.018501387604070305,
+      "grad_norm": 4.235851287841797,
+      "learning_rate": 3.512014787430684e-06,
+      "loss": 1.3307,
+      "mean_token_accuracy": 0.6350878715515137,
+      "num_tokens": 27217.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.262493872642517,
+      "epoch": 0.027752081406105456,
+      "grad_norm": 5.058028221130371,
+      "learning_rate": 5.360443622920518e-06,
+      "loss": 1.2136,
+      "mean_token_accuracy": 0.6545175433158874,
+      "num_tokens": 40014.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2985574364662171,
+      "epoch": 0.03700277520814061,
+      "grad_norm": 3.649226665496826,
+      "learning_rate": 7.2088724584103514e-06,
+      "loss": 1.266,
+      "mean_token_accuracy": 0.6147747755050659,
+      "num_tokens": 53225.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.314147698879242,
+      "epoch": 0.04625346901017576,
+      "grad_norm": 1.8768671751022339,
+      "learning_rate": 9.057301293900185e-06,
+      "loss": 1.0651,
+      "mean_token_accuracy": 0.6504510939121246,
+      "num_tokens": 66702.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.3419164538383483,
+      "epoch": 0.05550416281221091,
+      "grad_norm": 1.107013463973999,
+      "learning_rate": 1.0905730129390019e-05,
+      "loss": 0.9847,
+      "mean_token_accuracy": 0.6611938953399659,
+      "num_tokens": 79234.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.3964227795600892,
+      "epoch": 0.06475485661424607,
+      "grad_norm": 1.1377882957458496,
+      "learning_rate": 1.2754158964879853e-05,
+      "loss": 1.0259,
+      "mean_token_accuracy": 0.6437336921691894,
+      "num_tokens": 91832.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.4071043848991394,
+      "epoch": 0.07400555041628122,
+      "grad_norm": 1.1729836463928223,
+      "learning_rate": 1.4602587800369685e-05,
+      "loss": 0.9942,
+      "mean_token_accuracy": 0.6535955727100372,
+      "num_tokens": 104786.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.3836156249046325,
+      "epoch": 0.08325624421831637,
+      "grad_norm": 1.2727314233779907,
+      "learning_rate": 1.645101663585952e-05,
+      "loss": 0.9512,
+      "mean_token_accuracy": 0.6672091960906983,
+      "num_tokens": 117713.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.3533575654029846,
+      "epoch": 0.09250693802035152,
+      "grad_norm": 1.0665016174316406,
+      "learning_rate": 1.8299445471349355e-05,
+      "loss": 0.9175,
+      "mean_token_accuracy": 0.6751855850219727,
+      "num_tokens": 130661.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.3368927597999574,
+      "epoch": 0.10175763182238667,
+      "grad_norm": 0.7565245032310486,
+      "learning_rate": 2.0147874306839186e-05,
+      "loss": 0.8875,
+      "mean_token_accuracy": 0.6832059919834137,
+      "num_tokens": 144367.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.3387869954109193,
+      "epoch": 0.11100832562442182,
+      "grad_norm": 0.8816360831260681,
+      "learning_rate": 2.1996303142329023e-05,
+      "loss": 0.8901,
+      "mean_token_accuracy": 0.6898956418037414,
+      "num_tokens": 158098.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.3420647621154784,
+      "epoch": 0.12025901942645699,
+      "grad_norm": 0.9925475716590881,
+      "learning_rate": 2.3844731977818853e-05,
+      "loss": 0.922,
+      "mean_token_accuracy": 0.674148154258728,
+      "num_tokens": 171386.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.3588476538658143,
+      "epoch": 0.12950971322849214,
+      "grad_norm": 1.2224054336547852,
+      "learning_rate": 2.5693160813308687e-05,
+      "loss": 0.9012,
+      "mean_token_accuracy": 0.6888944447040558,
+      "num_tokens": 184011.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.3528849005699157,
+      "epoch": 0.13876040703052728,
+      "grad_norm": 0.8460184931755066,
+      "learning_rate": 2.754158964879852e-05,
+      "loss": 0.9482,
+      "mean_token_accuracy": 0.6652046918869019,
+      "num_tokens": 197201.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.3713510990142823,
+      "epoch": 0.14801110083256244,
+      "grad_norm": 1.1473662853240967,
+      "learning_rate": 2.9390018484288358e-05,
+      "loss": 0.9201,
+      "mean_token_accuracy": 0.6809458076953888,
+      "num_tokens": 209879.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.4071080327033996,
+      "epoch": 0.1572617946345976,
+      "grad_norm": 0.9973745942115784,
+      "learning_rate": 3.123844731977819e-05,
+      "loss": 0.98,
+      "mean_token_accuracy": 0.6592511236667633,
+      "num_tokens": 223079.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.3867051601409912,
+      "epoch": 0.16651248843663274,
+      "grad_norm": 1.2232855558395386,
+      "learning_rate": 3.308687615526803e-05,
+      "loss": 0.9706,
+      "mean_token_accuracy": 0.6540988445281982,
+      "num_tokens": 236579.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.3836822986602784,
+      "epoch": 0.1757631822386679,
+      "grad_norm": 1.4032989740371704,
+      "learning_rate": 3.493530499075786e-05,
+      "loss": 0.9215,
+      "mean_token_accuracy": 0.6792983472347259,
+      "num_tokens": 249412.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.3905784249305726,
+      "epoch": 0.18501387604070305,
+      "grad_norm": 1.32985520362854,
+      "learning_rate": 3.678373382624769e-05,
+      "loss": 0.9527,
+      "mean_token_accuracy": 0.6650208413600922,
+      "num_tokens": 262063.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.37818124294281,
+      "epoch": 0.1942645698427382,
+      "grad_norm": 1.0119655132293701,
+      "learning_rate": 3.8632162661737526e-05,
+      "loss": 0.914,
+      "mean_token_accuracy": 0.683707457780838,
+      "num_tokens": 275794.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.4193374395370484,
+      "epoch": 0.20351526364477335,
+      "grad_norm": 0.9579392671585083,
+      "learning_rate": 4.048059149722736e-05,
+      "loss": 0.9657,
+      "mean_token_accuracy": 0.6614095211029053,
+      "num_tokens": 289044.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.3844568610191346,
+      "epoch": 0.2127659574468085,
+      "grad_norm": 1.020785927772522,
+      "learning_rate": 4.232902033271719e-05,
+      "loss": 0.8909,
+      "mean_token_accuracy": 0.6828382730484008,
+      "num_tokens": 302504.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.3760874032974244,
+      "epoch": 0.22201665124884365,
+      "grad_norm": 0.9178677201271057,
+      "learning_rate": 4.4177449168207024e-05,
+      "loss": 0.7948,
+      "mean_token_accuracy": 0.7197233438491821,
+      "num_tokens": 315957.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.3967938780784608,
+      "epoch": 0.23126734505087881,
+      "grad_norm": 1.388061285018921,
+      "learning_rate": 4.602587800369686e-05,
+      "loss": 0.9133,
+      "mean_token_accuracy": 0.6771849095821381,
+      "num_tokens": 328806.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.4414387702941895,
+      "epoch": 0.24051803885291398,
+      "grad_norm": 0.9865493774414062,
+      "learning_rate": 4.787430683918669e-05,
+      "loss": 0.9665,
+      "mean_token_accuracy": 0.6627395570278167,
+      "num_tokens": 342048.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.4030232906341553,
+      "epoch": 0.24976873265494912,
+      "grad_norm": 1.1840286254882812,
+      "learning_rate": 4.972273567467653e-05,
+      "loss": 0.9092,
+      "mean_token_accuracy": 0.6842468678951263,
+      "num_tokens": 354934.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.4368361949920654,
+      "epoch": 0.2590194264569843,
+      "grad_norm": 1.1045658588409424,
+      "learning_rate": 5.157116451016636e-05,
+      "loss": 0.9749,
+      "mean_token_accuracy": 0.6620967447757721,
+      "num_tokens": 367651.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.4251260161399841,
+      "epoch": 0.2682701202590194,
+      "grad_norm": 0.9880494475364685,
+      "learning_rate": 5.3419593345656196e-05,
+      "loss": 0.9046,
+      "mean_token_accuracy": 0.6790169060230256,
+      "num_tokens": 381015.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.4615792393684388,
+      "epoch": 0.27752081406105455,
+      "grad_norm": 1.1569000482559204,
+      "learning_rate": 5.5268022181146026e-05,
+      "loss": 1.0025,
+      "mean_token_accuracy": 0.644273191690445,
+      "num_tokens": 393696.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.4423722743988037,
+      "epoch": 0.28677150786308975,
+      "grad_norm": 1.2215638160705566,
+      "learning_rate": 5.711645101663586e-05,
+      "loss": 0.9301,
+      "mean_token_accuracy": 0.6762896001338958,
+      "num_tokens": 407205.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.431472909450531,
+      "epoch": 0.2960222016651249,
+      "grad_norm": 0.7539701461791992,
+      "learning_rate": 5.89648798521257e-05,
+      "loss": 0.9627,
+      "mean_token_accuracy": 0.6622189521789551,
+      "num_tokens": 420423.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.4409304022789002,
+      "epoch": 0.30527289546716,
+      "grad_norm": 1.0752846002578735,
+      "learning_rate": 6.081330868761553e-05,
+      "loss": 0.9608,
+      "mean_token_accuracy": 0.6622848808765411,
+      "num_tokens": 433792.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.4193753361701966,
+      "epoch": 0.3145235892691952,
+      "grad_norm": 1.0436838865280151,
+      "learning_rate": 6.266173752310537e-05,
+      "loss": 0.9149,
+      "mean_token_accuracy": 0.678329062461853,
+      "num_tokens": 447028.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.474712109565735,
+      "epoch": 0.32377428307123035,
+      "grad_norm": 0.8787930011749268,
+      "learning_rate": 6.45101663585952e-05,
+      "loss": 1.0224,
+      "mean_token_accuracy": 0.639413845539093,
+      "num_tokens": 460030.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.446873426437378,
+      "epoch": 0.3330249768732655,
+      "grad_norm": 1.028443455696106,
+      "learning_rate": 6.635859519408502e-05,
+      "loss": 0.9809,
+      "mean_token_accuracy": 0.652896037697792,
+      "num_tokens": 473511.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.4507364869117736,
+      "epoch": 0.3422756706753006,
+      "grad_norm": 0.9386515617370605,
+      "learning_rate": 6.820702402957486e-05,
+      "loss": 0.9594,
+      "mean_token_accuracy": 0.6635801315307617,
+      "num_tokens": 487056.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.439187967777252,
+      "epoch": 0.3515263644773358,
+      "grad_norm": 1.075317144393921,
+      "learning_rate": 7.00554528650647e-05,
+      "loss": 0.9509,
+      "mean_token_accuracy": 0.6723007440567017,
+      "num_tokens": 500910.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.4200530290603637,
+      "epoch": 0.36077705827937095,
+      "grad_norm": 0.821780800819397,
+      "learning_rate": 7.190388170055453e-05,
+      "loss": 0.8318,
+      "mean_token_accuracy": 0.7095312774181366,
+      "num_tokens": 513491.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.4265631794929505,
+      "epoch": 0.3700277520814061,
+      "grad_norm": 1.0542839765548706,
+      "learning_rate": 7.375231053604437e-05,
+      "loss": 0.906,
+      "mean_token_accuracy": 0.6781497061252594,
+      "num_tokens": 526457.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.4734636306762696,
+      "epoch": 0.3792784458834413,
+      "grad_norm": 0.8997613787651062,
+      "learning_rate": 7.56007393715342e-05,
+      "loss": 0.9399,
+      "mean_token_accuracy": 0.6672972977161408,
+      "num_tokens": 539654.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.4279640436172485,
+      "epoch": 0.3885291396854764,
+      "grad_norm": 1.0399622917175293,
+      "learning_rate": 7.744916820702403e-05,
+      "loss": 0.8637,
+      "mean_token_accuracy": 0.6964247941970825,
+      "num_tokens": 552489.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.4684167385101319,
+      "epoch": 0.39777983348751156,
+      "grad_norm": 0.8020175695419312,
+      "learning_rate": 7.929759704251387e-05,
+      "loss": 0.9586,
+      "mean_token_accuracy": 0.6565007865428925,
+      "num_tokens": 565491.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.474439013004303,
+      "epoch": 0.4070305272895467,
+      "grad_norm": 1.190341591835022,
+      "learning_rate": 8.114602587800369e-05,
+      "loss": 0.9494,
+      "mean_token_accuracy": 0.658382123708725,
+      "num_tokens": 579034.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.5036101937294006,
+      "epoch": 0.4162812210915819,
+      "grad_norm": 0.981868326663971,
+      "learning_rate": 8.299445471349353e-05,
+      "loss": 1.0627,
+      "mean_token_accuracy": 0.6313726782798768,
+      "num_tokens": 591893.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.4632315397262574,
+      "epoch": 0.425531914893617,
+      "grad_norm": 1.076858639717102,
+      "learning_rate": 8.484288354898337e-05,
+      "loss": 0.9065,
+      "mean_token_accuracy": 0.6824360251426697,
+      "num_tokens": 604649.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.4752267003059387,
+      "epoch": 0.43478260869565216,
+      "grad_norm": 0.8257336020469666,
+      "learning_rate": 8.66913123844732e-05,
+      "loss": 0.9493,
+      "mean_token_accuracy": 0.6698306739330292,
+      "num_tokens": 617820.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.4853222370147705,
+      "epoch": 0.4440333024976873,
+      "grad_norm": 0.8594188094139099,
+      "learning_rate": 8.853974121996304e-05,
+      "loss": 0.9786,
+      "mean_token_accuracy": 0.6549507141113281,
+      "num_tokens": 630699.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.464365589618683,
+      "epoch": 0.4532839962997225,
+      "grad_norm": 1.0990275144577026,
+      "learning_rate": 9.038817005545288e-05,
+      "loss": 0.8889,
+      "mean_token_accuracy": 0.6922601819038391,
+      "num_tokens": 644127.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.489177119731903,
+      "epoch": 0.46253469010175763,
+      "grad_norm": 0.9509454369544983,
+      "learning_rate": 9.22365988909427e-05,
+      "loss": 0.9426,
+      "mean_token_accuracy": 0.6659547984600067,
+      "num_tokens": 657148.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.4962970852851867,
+      "epoch": 0.47178538390379277,
+      "grad_norm": 1.2484740018844604,
+      "learning_rate": 9.408502772643254e-05,
+      "loss": 0.953,
+      "mean_token_accuracy": 0.6586906611919403,
+      "num_tokens": 670505.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.4610695004463197,
+      "epoch": 0.48103607770582796,
+      "grad_norm": 1.038205862045288,
+      "learning_rate": 9.593345656192237e-05,
+      "loss": 0.8792,
+      "mean_token_accuracy": 0.695543521642685,
+      "num_tokens": 683406.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.543120539188385,
+      "epoch": 0.4902867715078631,
+      "grad_norm": 0.6887747645378113,
+      "learning_rate": 9.778188539741221e-05,
+      "loss": 1.0731,
+      "mean_token_accuracy": 0.6240392029285431,
+      "num_tokens": 695965.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.5109227538108825,
+      "epoch": 0.49953746530989823,
+      "grad_norm": 1.0471116304397583,
+      "learning_rate": 9.963031423290203e-05,
+      "loss": 0.8617,
+      "mean_token_accuracy": 0.6985665023326874,
+      "num_tokens": 709172.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.4863034248352052,
+      "epoch": 0.5087881591119334,
+      "grad_norm": 0.9132720828056335,
+      "learning_rate": 9.999985025125081e-05,
+      "loss": 0.8525,
+      "mean_token_accuracy": 0.7047226369380951,
+      "num_tokens": 722271.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.517179274559021,
+      "epoch": 0.5180388529139686,
+      "grad_norm": 0.8982908129692078,
+      "learning_rate": 9.999924189849452e-05,
+      "loss": 0.9545,
+      "mean_token_accuracy": 0.6619311034679413,
+      "num_tokens": 735490.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.5107035517692566,
+      "epoch": 0.5272895467160037,
+      "grad_norm": 1.1028233766555786,
+      "learning_rate": 9.99981655881237e-05,
+      "loss": 1.008,
+      "mean_token_accuracy": 0.6386784017086029,
+      "num_tokens": 748441.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.4935171365737916,
+      "epoch": 0.5365402405180388,
+      "grad_norm": 1.5201911926269531,
+      "learning_rate": 9.999662133021185e-05,
+      "loss": 0.9209,
+      "mean_token_accuracy": 0.6722200810909271,
+      "num_tokens": 761485.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.5198947191238403,
+      "epoch": 0.545790934320074,
+      "grad_norm": 1.154875636100769,
+      "learning_rate": 9.999460913921211e-05,
+      "loss": 0.9829,
+      "mean_token_accuracy": 0.6530335664749145,
+      "num_tokens": 775045.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.5305540323257447,
+      "epoch": 0.5550416281221091,
+      "grad_norm": 0.9236158132553101,
+      "learning_rate": 9.99921290339572e-05,
+      "loss": 1.0093,
+      "mean_token_accuracy": 0.6421272337436676,
+      "num_tokens": 787827.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.5291874051094054,
+      "epoch": 0.5642923219241444,
+      "grad_norm": 1.0652803182601929,
+      "learning_rate": 9.998918103765914e-05,
+      "loss": 0.927,
+      "mean_token_accuracy": 0.6654485881328582,
+      "num_tokens": 801418.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.550955057144165,
+      "epoch": 0.5735430157261795,
+      "grad_norm": 0.9596770405769348,
+      "learning_rate": 9.99857651779091e-05,
+      "loss": 1.0393,
+      "mean_token_accuracy": 0.6354499399662018,
+      "num_tokens": 814457.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.501573884487152,
+      "epoch": 0.5827937095282146,
+      "grad_norm": 0.9417737722396851,
+      "learning_rate": 9.998188148667708e-05,
+      "loss": 0.8915,
+      "mean_token_accuracy": 0.6816763877868652,
+      "num_tokens": 827714.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.4670620203018188,
+      "epoch": 0.5920444033302498,
+      "grad_norm": 1.3990159034729004,
+      "learning_rate": 9.997753000031175e-05,
+      "loss": 0.8596,
+      "mean_token_accuracy": 0.6988556385040283,
+      "num_tokens": 840400.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.528640353679657,
+      "epoch": 0.6012950971322849,
+      "grad_norm": 1.0697306394577026,
+      "learning_rate": 9.997271075953994e-05,
+      "loss": 0.9713,
+      "mean_token_accuracy": 0.6468679904937744,
+      "num_tokens": 853028.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.5169320225715637,
+      "epoch": 0.61054579093432,
+      "grad_norm": 1.1989524364471436,
+      "learning_rate": 9.996742380946628e-05,
+      "loss": 0.9643,
+      "mean_token_accuracy": 0.6630256116390228,
+      "num_tokens": 866211.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.486118495464325,
+      "epoch": 0.6197964847363552,
+      "grad_norm": 1.3864704370498657,
+      "learning_rate": 9.996166919957297e-05,
+      "loss": 0.9255,
+      "mean_token_accuracy": 0.6708253383636474,
+      "num_tokens": 879248.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.5209161520004273,
+      "epoch": 0.6290471785383904,
+      "grad_norm": 0.8406121134757996,
+      "learning_rate": 9.995544698371904e-05,
+      "loss": 0.9067,
+      "mean_token_accuracy": 0.6848369777202606,
+      "num_tokens": 892532.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.5373739361763001,
+      "epoch": 0.6382978723404256,
+      "grad_norm": 1.0999748706817627,
+      "learning_rate": 9.994875722014008e-05,
+      "loss": 1.014,
+      "mean_token_accuracy": 0.6408933758735657,
+      "num_tokens": 905340.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.516480839252472,
+      "epoch": 0.6475485661424607,
+      "grad_norm": 0.9417734742164612,
+      "learning_rate": 9.994159997144752e-05,
+      "loss": 0.9749,
+      "mean_token_accuracy": 0.6582231283187866,
+      "num_tokens": 918565.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.5324442982673645,
+      "epoch": 0.6567992599444958,
+      "grad_norm": 0.9670525193214417,
+      "learning_rate": 9.993397530462818e-05,
+      "loss": 0.9952,
+      "mean_token_accuracy": 0.6441864192485809,
+      "num_tokens": 931590.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.5277332067489624,
+      "epoch": 0.666049953746531,
+      "grad_norm": 1.1460349559783936,
+      "learning_rate": 9.992588329104354e-05,
+      "loss": 0.9238,
+      "mean_token_accuracy": 0.677095913887024,
+      "num_tokens": 945000.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.5624989986419677,
+      "epoch": 0.6753006475485661,
+      "grad_norm": 0.9031643867492676,
+      "learning_rate": 9.991732400642916e-05,
+      "loss": 0.9745,
+      "mean_token_accuracy": 0.6575322329998017,
+      "num_tokens": 958208.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.5397494316101075,
+      "epoch": 0.6845513413506013,
+      "grad_norm": 0.9962924122810364,
+      "learning_rate": 9.990829753089389e-05,
+      "loss": 0.9186,
+      "mean_token_accuracy": 0.6840661704540253,
+      "num_tokens": 971666.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.5536072373390197,
+      "epoch": 0.6938020351526365,
+      "grad_norm": 0.9214943051338196,
+      "learning_rate": 9.989880394891917e-05,
+      "loss": 0.9868,
+      "mean_token_accuracy": 0.656391030550003,
+      "num_tokens": 984577.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.559676969051361,
+      "epoch": 0.7030527289546716,
+      "grad_norm": 0.9955084919929504,
+      "learning_rate": 9.988884334935823e-05,
+      "loss": 0.9135,
+      "mean_token_accuracy": 0.6811449110507966,
+      "num_tokens": 997960.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.5445342898368835,
+      "epoch": 0.7123034227567068,
+      "grad_norm": 0.9573326706886292,
+      "learning_rate": 9.987841582543525e-05,
+      "loss": 0.9002,
+      "mean_token_accuracy": 0.6902318239212036,
+      "num_tokens": 1011462.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.542565393447876,
+      "epoch": 0.7215541165587419,
+      "grad_norm": 0.9867991805076599,
+      "learning_rate": 9.986752147474449e-05,
+      "loss": 0.9514,
+      "mean_token_accuracy": 0.6688324213027954,
+      "num_tokens": 1024674.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.5256419897079467,
+      "epoch": 0.730804810360777,
+      "grad_norm": 0.9705437421798706,
+      "learning_rate": 9.985616039924938e-05,
+      "loss": 0.8626,
+      "mean_token_accuracy": 0.6957047760486603,
+      "num_tokens": 1037752.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.548219668865204,
+      "epoch": 0.7400555041628122,
+      "grad_norm": 1.084227204322815,
+      "learning_rate": 9.984433270528158e-05,
+      "loss": 0.9711,
+      "mean_token_accuracy": 0.6522482097148895,
+      "num_tokens": 1050884.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.517548406124115,
+      "epoch": 0.7493061979648473,
+      "grad_norm": 1.3059539794921875,
+      "learning_rate": 9.983203850353996e-05,
+      "loss": 0.8743,
+      "mean_token_accuracy": 0.6982108950614929,
+      "num_tokens": 1064422.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.562002694606781,
+      "epoch": 0.7585568917668826,
+      "grad_norm": 1.038290023803711,
+      "learning_rate": 9.981927790908955e-05,
+      "loss": 0.9767,
+      "mean_token_accuracy": 0.6550276100635528,
+      "num_tokens": 1077313.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.5344805240631103,
+      "epoch": 0.7678075855689177,
+      "grad_norm": 1.287285327911377,
+      "learning_rate": 9.980605104136054e-05,
+      "loss": 0.9671,
+      "mean_token_accuracy": 0.6576497316360473,
+      "num_tokens": 1090303.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.5300293803215026,
+      "epoch": 0.7770582793709528,
+      "grad_norm": 0.9832729697227478,
+      "learning_rate": 9.979235802414704e-05,
+      "loss": 0.9237,
+      "mean_token_accuracy": 0.6844188570976257,
+      "num_tokens": 1103125.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.5637137055397035,
+      "epoch": 0.786308973172988,
+      "grad_norm": 1.0553216934204102,
+      "learning_rate": 9.977819898560605e-05,
+      "loss": 0.9595,
+      "mean_token_accuracy": 0.6634797155857086,
+      "num_tokens": 1116372.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.56386079788208,
+      "epoch": 0.7955596669750231,
+      "grad_norm": 1.0578479766845703,
+      "learning_rate": 9.976357405825617e-05,
+      "loss": 0.9763,
+      "mean_token_accuracy": 0.6628308475017548,
+      "num_tokens": 1129730.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.528944981098175,
+      "epoch": 0.8048103607770583,
+      "grad_norm": 1.1011334657669067,
+      "learning_rate": 9.97484833789764e-05,
+      "loss": 0.8523,
+      "mean_token_accuracy": 0.7008255600929261,
+      "num_tokens": 1143105.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.5662498712539672,
+      "epoch": 0.8140610545790934,
+      "grad_norm": 0.8881069421768188,
+      "learning_rate": 9.973292708900484e-05,
+      "loss": 0.9357,
+      "mean_token_accuracy": 0.6682270467281342,
+      "num_tokens": 1155545.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.5809241771697997,
+      "epoch": 0.8233117483811286,
+      "grad_norm": 1.1357725858688354,
+      "learning_rate": 9.971690533393741e-05,
+      "loss": 0.9769,
+      "mean_token_accuracy": 0.6525391936302185,
+      "num_tokens": 1169079.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.5619606137275697,
+      "epoch": 0.8325624421831638,
+      "grad_norm": 1.3776752948760986,
+      "learning_rate": 9.970041826372639e-05,
+      "loss": 0.9695,
+      "mean_token_accuracy": 0.6635968685150146,
+      "num_tokens": 1182074.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.5364328384399415,
+      "epoch": 0.8418131359851989,
+      "grad_norm": 1.1025310754776,
+      "learning_rate": 9.968346603267912e-05,
+      "loss": 0.9185,
+      "mean_token_accuracy": 0.6784902811050415,
+      "num_tokens": 1195155.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.5355506658554077,
+      "epoch": 0.851063829787234,
+      "grad_norm": 0.9506418108940125,
+      "learning_rate": 9.966604879945659e-05,
+      "loss": 0.9145,
+      "mean_token_accuracy": 0.6740457832813262,
+      "num_tokens": 1207971.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.5638957381248475,
+      "epoch": 0.8603145235892692,
+      "grad_norm": 1.1018052101135254,
+      "learning_rate": 9.964816672707172e-05,
+      "loss": 0.9725,
+      "mean_token_accuracy": 0.6551022946834564,
+      "num_tokens": 1221499.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.5352379202842712,
+      "epoch": 0.8695652173913043,
+      "grad_norm": 1.0132355690002441,
+      "learning_rate": 9.962981998288813e-05,
+      "loss": 0.8719,
+      "mean_token_accuracy": 0.6871542632579803,
+      "num_tokens": 1234784.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.5783620834350587,
+      "epoch": 0.8788159111933395,
+      "grad_norm": 1.024332880973816,
+      "learning_rate": 9.96110087386184e-05,
+      "loss": 0.9353,
+      "mean_token_accuracy": 0.6774326741695404,
+      "num_tokens": 1248222.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.540341329574585,
+      "epoch": 0.8880666049953746,
+      "grad_norm": 1.044959545135498,
+      "learning_rate": 9.959173317032247e-05,
+      "loss": 0.9502,
+      "mean_token_accuracy": 0.6629186689853668,
+      "num_tokens": 1261151.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.5592219948768615,
+      "epoch": 0.8973172987974098,
+      "grad_norm": 0.9212714433670044,
+      "learning_rate": 9.957199345840609e-05,
+      "loss": 0.9693,
+      "mean_token_accuracy": 0.6631014049053192,
+      "num_tokens": 1274605.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.543090546131134,
+      "epoch": 0.906567992599445,
+      "grad_norm": 1.0074114799499512,
+      "learning_rate": 9.955178978761901e-05,
+      "loss": 0.9454,
+      "mean_token_accuracy": 0.6679804205894471,
+      "num_tokens": 1287486.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.5354868412017821,
+      "epoch": 0.9158186864014801,
+      "grad_norm": 1.107582688331604,
+      "learning_rate": 9.953112234705333e-05,
+      "loss": 0.8522,
+      "mean_token_accuracy": 0.703345662355423,
+      "num_tokens": 1301186.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.5681379914283753,
+      "epoch": 0.9250693802035153,
+      "grad_norm": 0.9099576473236084,
+      "learning_rate": 9.950999133014171e-05,
+      "loss": 0.9867,
+      "mean_token_accuracy": 0.6551129937171936,
+      "num_tokens": 1314154.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.567443573474884,
+      "epoch": 0.9343200740055504,
+      "grad_norm": 1.4343048334121704,
+      "learning_rate": 9.948839693465558e-05,
+      "loss": 0.8902,
+      "mean_token_accuracy": 0.6830959975719452,
+      "num_tokens": 1327333.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.5958142399787902,
+      "epoch": 0.9435707678075855,
+      "grad_norm": 0.9880425333976746,
+      "learning_rate": 9.946633936270318e-05,
+      "loss": 1.0509,
+      "mean_token_accuracy": 0.6263476490974427,
+      "num_tokens": 1340958.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.5669716715812683,
+      "epoch": 0.9528214616096207,
+      "grad_norm": 1.0334872007369995,
+      "learning_rate": 9.944381882072786e-05,
+      "loss": 0.9664,
+      "mean_token_accuracy": 0.6644764959812164,
+      "num_tokens": 1353850.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.5883784770965577,
+      "epoch": 0.9620721554116559,
+      "grad_norm": 1.1019115447998047,
+      "learning_rate": 9.942083551950598e-05,
+      "loss": 0.9923,
+      "mean_token_accuracy": 0.6469030797481536,
+      "num_tokens": 1367556.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.5479753255844115,
+      "epoch": 0.971322849213691,
+      "grad_norm": 1.1472551822662354,
+      "learning_rate": 9.939738967414505e-05,
+      "loss": 0.9273,
+      "mean_token_accuracy": 0.6670134663581848,
+      "num_tokens": 1380637.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.5589645266532899,
+      "epoch": 0.9805735430157262,
+      "grad_norm": 1.028989315032959,
+      "learning_rate": 9.937348150408159e-05,
+      "loss": 0.9592,
+      "mean_token_accuracy": 0.6594673275947571,
+      "num_tokens": 1393569.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.5476471185684204,
+      "epoch": 0.9898242368177613,
+      "grad_norm": 1.1020350456237793,
+      "learning_rate": 9.934911123307921e-05,
+      "loss": 0.9043,
+      "mean_token_accuracy": 0.6805355906486511,
+      "num_tokens": 1406782.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.5664406776428224,
+      "epoch": 0.9990749306197965,
+      "grad_norm": 1.1525558233261108,
+      "learning_rate": 9.932427908922647e-05,
+      "loss": 0.9545,
+      "mean_token_accuracy": 0.6627410531044007,
+      "num_tokens": 1420398.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.5590189814567565,
+      "epoch": 1.0083256244218317,
+      "grad_norm": 1.1105467081069946,
+      "learning_rate": 9.92989853049347e-05,
+      "loss": 0.939,
+      "mean_token_accuracy": 0.6663093864917755,
+      "num_tokens": 1432931.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.507142388820648,
+      "epoch": 1.0175763182238668,
+      "grad_norm": 1.2052907943725586,
+      "learning_rate": 9.927323011693585e-05,
+      "loss": 0.808,
+      "mean_token_accuracy": 0.7172160148620605,
+      "num_tokens": 1445583.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.5202176928520204,
+      "epoch": 1.026827012025902,
+      "grad_norm": 1.2583285570144653,
+      "learning_rate": 9.924701376628032e-05,
+      "loss": 0.8395,
+      "mean_token_accuracy": 0.7034675240516662,
+      "num_tokens": 1458584.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.542691159248352,
+      "epoch": 1.0360777058279371,
+      "grad_norm": 1.2120987176895142,
+      "learning_rate": 9.922033649833462e-05,
+      "loss": 0.9186,
+      "mean_token_accuracy": 0.674918931722641,
+      "num_tokens": 1471458.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.5525382518768311,
+      "epoch": 1.0453283996299723,
+      "grad_norm": 1.3977634906768799,
+      "learning_rate": 9.91931985627792e-05,
+      "loss": 0.9375,
+      "mean_token_accuracy": 0.6736328840255738,
+      "num_tokens": 1485192.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.5510458946228027,
+      "epoch": 1.0545790934320074,
+      "grad_norm": 1.1912108659744263,
+      "learning_rate": 9.916560021360593e-05,
+      "loss": 0.8935,
+      "mean_token_accuracy": 0.6817401230335236,
+      "num_tokens": 1498505.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.5085613250732421,
+      "epoch": 1.0638297872340425,
+      "grad_norm": 1.4452115297317505,
+      "learning_rate": 9.913754170911592e-05,
+      "loss": 0.8431,
+      "mean_token_accuracy": 0.7055911183357239,
+      "num_tokens": 1511241.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.4957143902778625,
+      "epoch": 1.0730804810360777,
+      "grad_norm": 1.3652784824371338,
+      "learning_rate": 9.910902331191693e-05,
+      "loss": 0.7957,
+      "mean_token_accuracy": 0.7180816411972046,
+      "num_tokens": 1525367.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.527734887599945,
+      "epoch": 1.0823311748381128,
+      "grad_norm": 1.291463851928711,
+      "learning_rate": 9.908004528892107e-05,
+      "loss": 0.9465,
+      "mean_token_accuracy": 0.6600941479206085,
+      "num_tokens": 1539092.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.5234104871749878,
+      "epoch": 1.091581868640148,
+      "grad_norm": 1.1057231426239014,
+      "learning_rate": 9.905060791134215e-05,
+      "loss": 0.8971,
+      "mean_token_accuracy": 0.6782453298568726,
+      "num_tokens": 1551652.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.498512291908264,
+      "epoch": 1.100832562442183,
+      "grad_norm": 1.1634302139282227,
+      "learning_rate": 9.902071145469326e-05,
+      "loss": 0.8391,
+      "mean_token_accuracy": 0.7078715145587922,
+      "num_tokens": 1564975.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.5350223302841186,
+      "epoch": 1.1100832562442182,
+      "grad_norm": 1.351274847984314,
+      "learning_rate": 9.899035619878414e-05,
+      "loss": 0.9231,
+      "mean_token_accuracy": 0.6709426641464233,
+      "num_tokens": 1577673.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.5172717928886414,
+      "epoch": 1.1193339500462534,
+      "grad_norm": 1.6556415557861328,
+      "learning_rate": 9.895954242771857e-05,
+      "loss": 0.8369,
+      "mean_token_accuracy": 0.7084704637527466,
+      "num_tokens": 1590025.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.5295075058937073,
+      "epoch": 1.1285846438482887,
+      "grad_norm": 1.1777360439300537,
+      "learning_rate": 9.892827042989165e-05,
+      "loss": 0.9039,
+      "mean_token_accuracy": 0.6778266191482544,
+      "num_tokens": 1603718.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.5300594091415405,
+      "epoch": 1.1378353376503239,
+      "grad_norm": 1.4627207517623901,
+      "learning_rate": 9.889654049798727e-05,
+      "loss": 0.8765,
+      "mean_token_accuracy": 0.6937211096286774,
+      "num_tokens": 1617186.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.5501411199569701,
+      "epoch": 1.147086031452359,
+      "grad_norm": 1.2685083150863647,
+      "learning_rate": 9.886435292897513e-05,
+      "loss": 0.9389,
+      "mean_token_accuracy": 0.6703861057758331,
+      "num_tokens": 1630131.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.5451602935791016,
+      "epoch": 1.1563367252543941,
+      "grad_norm": 1.3802698850631714,
+      "learning_rate": 9.883170802410823e-05,
+      "loss": 0.9181,
+      "mean_token_accuracy": 0.6823853075504303,
+      "num_tokens": 1642821.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.5610610961914062,
+      "epoch": 1.1655874190564293,
+      "grad_norm": 1.3597441911697388,
+      "learning_rate": 9.87986060889198e-05,
+      "loss": 0.916,
+      "mean_token_accuracy": 0.6728592872619629,
+      "num_tokens": 1656369.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.5185763478279113,
+      "epoch": 1.1748381128584644,
+      "grad_norm": 1.4725332260131836,
+      "learning_rate": 9.876504743322057e-05,
+      "loss": 0.8854,
+      "mean_token_accuracy": 0.6831559896469116,
+      "num_tokens": 1669426.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.52033189535141,
+      "epoch": 1.1840888066604995,
+      "grad_norm": 1.5024479627609253,
+      "learning_rate": 9.873103237109592e-05,
+      "loss": 0.9079,
+      "mean_token_accuracy": 0.6803531587123871,
+      "num_tokens": 1682729.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.5095137238502503,
+      "epoch": 1.1933395004625347,
+      "grad_norm": 1.4787187576293945,
+      "learning_rate": 9.869656122090278e-05,
+      "loss": 0.8965,
+      "mean_token_accuracy": 0.6727837383747101,
+      "num_tokens": 1695958.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.4895215392112733,
+      "epoch": 1.2025901942645698,
+      "grad_norm": 1.2908035516738892,
+      "learning_rate": 9.866163430526682e-05,
+      "loss": 0.8423,
+      "mean_token_accuracy": 0.7072388172149658,
+      "num_tokens": 1708886.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.5007799863815308,
+      "epoch": 1.211840888066605,
+      "grad_norm": 1.2461620569229126,
+      "learning_rate": 9.86262519510793e-05,
+      "loss": 0.8371,
+      "mean_token_accuracy": 0.7080549716949462,
+      "num_tokens": 1722148.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.527963149547577,
+      "epoch": 1.22109158186864,
+      "grad_norm": 1.7748533487319946,
+      "learning_rate": 9.859041448949408e-05,
+      "loss": 0.9122,
+      "mean_token_accuracy": 0.6771592617034912,
+      "num_tokens": 1734675.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.5244122385978698,
+      "epoch": 1.2303422756706752,
+      "grad_norm": 1.1288245916366577,
+      "learning_rate": 9.855412225592453e-05,
+      "loss": 0.881,
+      "mean_token_accuracy": 0.6893887996673584,
+      "num_tokens": 1748366.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.5070539236068725,
+      "epoch": 1.2395929694727104,
+      "grad_norm": 1.713269591331482,
+      "learning_rate": 9.85173755900403e-05,
+      "loss": 0.8328,
+      "mean_token_accuracy": 0.7106398522853852,
+      "num_tokens": 1761460.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.5405545592308045,
+      "epoch": 1.2488436632747457,
+      "grad_norm": 1.5272797346115112,
+      "learning_rate": 9.84801748357643e-05,
+      "loss": 0.9288,
+      "mean_token_accuracy": 0.6747880399227142,
+      "num_tokens": 1774852.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.5236643552780151,
+      "epoch": 1.2580943570767809,
+      "grad_norm": 1.0686043500900269,
+      "learning_rate": 9.844252034126929e-05,
+      "loss": 0.9284,
+      "mean_token_accuracy": 0.6757431268692017,
+      "num_tokens": 1787880.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.5348403930664063,
+      "epoch": 1.267345050878816,
+      "grad_norm": 1.2732406854629517,
+      "learning_rate": 9.840441245897476e-05,
+      "loss": 0.9667,
+      "mean_token_accuracy": 0.6587969183921814,
+      "num_tokens": 1801119.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.5348328948020935,
+      "epoch": 1.2765957446808511,
+      "grad_norm": 1.2598549127578735,
+      "learning_rate": 9.836585154554362e-05,
+      "loss": 0.8884,
+      "mean_token_accuracy": 0.6853720486164093,
+      "num_tokens": 1814076.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.5214549064636231,
+      "epoch": 1.2858464384828863,
+      "grad_norm": 1.324201226234436,
+      "learning_rate": 9.832683796187878e-05,
+      "loss": 0.8884,
+      "mean_token_accuracy": 0.6889445841312408,
+      "num_tokens": 1827050.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.532666027545929,
+      "epoch": 1.2950971322849214,
+      "grad_norm": 1.1742793321609497,
+      "learning_rate": 9.828737207311986e-05,
+      "loss": 0.8838,
+      "mean_token_accuracy": 0.6876072645187378,
+      "num_tokens": 1840821.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.529241418838501,
+      "epoch": 1.3043478260869565,
+      "grad_norm": 1.4952448606491089,
+      "learning_rate": 9.824745424863973e-05,
+      "loss": 0.8956,
+      "mean_token_accuracy": 0.6810737133026123,
+      "num_tokens": 1853871.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.5505860686302184,
+      "epoch": 1.3135985198889917,
+      "grad_norm": 1.3099291324615479,
+      "learning_rate": 9.820708486204105e-05,
+      "loss": 0.9357,
+      "mean_token_accuracy": 0.6686550080776215,
+      "num_tokens": 1867332.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.5253514289855956,
+      "epoch": 1.3228492136910268,
+      "grad_norm": 1.2284352779388428,
+      "learning_rate": 9.816626429115276e-05,
+      "loss": 0.8606,
+      "mean_token_accuracy": 0.7042006254196167,
+      "num_tokens": 1880327.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.5825935482978821,
+      "epoch": 1.332099907493062,
+      "grad_norm": 1.3153958320617676,
+      "learning_rate": 9.812499291802663e-05,
+      "loss": 0.9991,
+      "mean_token_accuracy": 0.6431185126304626,
+      "num_tokens": 1893222.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.5448525667190551,
+      "epoch": 1.341350601295097,
+      "grad_norm": 1.3306663036346436,
+      "learning_rate": 9.808327112893356e-05,
+      "loss": 0.8683,
+      "mean_token_accuracy": 0.695203572511673,
+      "num_tokens": 1906670.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.5516000151634217,
+      "epoch": 1.3506012950971322,
+      "grad_norm": 1.3316166400909424,
+      "learning_rate": 9.804109931436005e-05,
+      "loss": 0.8646,
+      "mean_token_accuracy": 0.6990422964096069,
+      "num_tokens": 1920384.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.5267711400985717,
+      "epoch": 1.3598519888991674,
+      "grad_norm": 1.3226373195648193,
+      "learning_rate": 9.799847786900453e-05,
+      "loss": 0.8768,
+      "mean_token_accuracy": 0.6939898490905761,
+      "num_tokens": 1933149.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.5407219767570495,
+      "epoch": 1.3691026827012025,
+      "grad_norm": 1.4068697690963745,
+      "learning_rate": 9.795540719177365e-05,
+      "loss": 0.8969,
+      "mean_token_accuracy": 0.682057934999466,
+      "num_tokens": 1945899.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.5163644909858705,
+      "epoch": 1.3783533765032376,
+      "grad_norm": 1.3001612424850464,
+      "learning_rate": 9.791188768577851e-05,
+      "loss": 0.818,
+      "mean_token_accuracy": 0.7132867455482483,
+      "num_tokens": 1958939.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.5620590686798095,
+      "epoch": 1.3876040703052728,
+      "grad_norm": 1.4245188236236572,
+      "learning_rate": 9.786791975833101e-05,
+      "loss": 0.9319,
+      "mean_token_accuracy": 0.6648121118545532,
+      "num_tokens": 1972114.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.5767237067222595,
+      "epoch": 1.396854764107308,
+      "grad_norm": 1.2391294240951538,
+      "learning_rate": 9.782350382093993e-05,
+      "loss": 0.9605,
+      "mean_token_accuracy": 0.6504848659038543,
+      "num_tokens": 1985507.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.5531885027885437,
+      "epoch": 1.4061054579093433,
+      "grad_norm": 1.127484679222107,
+      "learning_rate": 9.777864028930705e-05,
+      "loss": 0.9028,
+      "mean_token_accuracy": 0.6821092009544373,
+      "num_tokens": 1998202.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.5488298177719115,
+      "epoch": 1.4153561517113784,
+      "grad_norm": 1.4220211505889893,
+      "learning_rate": 9.773332958332337e-05,
+      "loss": 0.8728,
+      "mean_token_accuracy": 0.6968444228172302,
+      "num_tokens": 2011296.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.5239035129547118,
+      "epoch": 1.4246068455134135,
+      "grad_norm": 1.204034447669983,
+      "learning_rate": 9.768757212706515e-05,
+      "loss": 0.8686,
+      "mean_token_accuracy": 0.6997561752796173,
+      "num_tokens": 2024206.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.5552138090133667,
+      "epoch": 1.4338575393154487,
+      "grad_norm": 1.3252023458480835,
+      "learning_rate": 9.764136834878986e-05,
+      "loss": 0.9111,
+      "mean_token_accuracy": 0.6722259402275086,
+      "num_tokens": 2036934.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.5242789149284364,
+      "epoch": 1.4431082331174838,
+      "grad_norm": 1.1921788454055786,
+      "learning_rate": 9.759471868093226e-05,
+      "loss": 0.8687,
+      "mean_token_accuracy": 0.6929886519908905,
+      "num_tokens": 2049932.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.5449042797088623,
+      "epoch": 1.452358926919519,
+      "grad_norm": 0.9806531071662903,
+      "learning_rate": 9.754762356010032e-05,
+      "loss": 0.9215,
+      "mean_token_accuracy": 0.6735273063182831,
+      "num_tokens": 2063142.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.5611037492752076,
+      "epoch": 1.461609620721554,
+      "grad_norm": 1.1823029518127441,
+      "learning_rate": 9.750008342707113e-05,
+      "loss": 0.9753,
+      "mean_token_accuracy": 0.6566466450691223,
+      "num_tokens": 2075764.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.5282315135002136,
+      "epoch": 1.4708603145235892,
+      "grad_norm": 1.1169211864471436,
+      "learning_rate": 9.745209872678677e-05,
+      "loss": 0.8698,
+      "mean_token_accuracy": 0.6906334280967712,
+      "num_tokens": 2089251.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.5164031744003297,
+      "epoch": 1.4801110083256244,
+      "grad_norm": 1.2573009729385376,
+      "learning_rate": 9.740366990835018e-05,
+      "loss": 0.8727,
+      "mean_token_accuracy": 0.6923319816589355,
+      "num_tokens": 2102337.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.5354086637496949,
+      "epoch": 1.4893617021276595,
+      "grad_norm": 1.3182599544525146,
+      "learning_rate": 9.735479742502089e-05,
+      "loss": 0.8748,
+      "mean_token_accuracy": 0.6936228930950165,
+      "num_tokens": 2115556.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.5114194750785828,
+      "epoch": 1.4986123959296949,
+      "grad_norm": 1.6607362031936646,
+      "learning_rate": 9.73054817342109e-05,
+      "loss": 0.8662,
+      "mean_token_accuracy": 0.6974299073219299,
+      "num_tokens": 2128937.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.5332423567771911,
+      "epoch": 1.50786308973173,
+      "grad_norm": 1.3788079023361206,
+      "learning_rate": 9.725572329748022e-05,
+      "loss": 0.9138,
+      "mean_token_accuracy": 0.6795754253864288,
+      "num_tokens": 2141986.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.5237389087677002,
+      "epoch": 1.5171137835337651,
+      "grad_norm": 1.6337846517562866,
+      "learning_rate": 9.720552258053275e-05,
+      "loss": 0.9173,
+      "mean_token_accuracy": 0.6762649834156036,
+      "num_tokens": 2155201.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.547822082042694,
+      "epoch": 1.5263644773358003,
+      "grad_norm": 1.4085562229156494,
+      "learning_rate": 9.715488005321171e-05,
+      "loss": 0.9752,
+      "mean_token_accuracy": 0.6518037557601929,
+      "num_tokens": 2168643.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.5555969715118407,
+      "epoch": 1.5356151711378354,
+      "grad_norm": 1.471933126449585,
+      "learning_rate": 9.710379618949546e-05,
+      "loss": 0.8985,
+      "mean_token_accuracy": 0.6819450974464416,
+      "num_tokens": 2181997.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.5805710434913636,
+      "epoch": 1.5448658649398705,
+      "grad_norm": 1.3684574365615845,
+      "learning_rate": 9.705227146749289e-05,
+      "loss": 0.9373,
+      "mean_token_accuracy": 0.6641357362270355,
+      "num_tokens": 2195400.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.5455497264862061,
+      "epoch": 1.5541165587419057,
+      "grad_norm": 1.4113730192184448,
+      "learning_rate": 9.700030636943901e-05,
+      "loss": 0.8293,
+      "mean_token_accuracy": 0.70506911277771,
+      "num_tokens": 2209124.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.5282757997512817,
+      "epoch": 1.5633672525439408,
+      "grad_norm": 1.3547190427780151,
+      "learning_rate": 9.694790138169051e-05,
+      "loss": 0.8661,
+      "mean_token_accuracy": 0.6936029970645905,
+      "num_tokens": 2222519.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.5811396718025208,
+      "epoch": 1.572617946345976,
+      "grad_norm": 1.1947416067123413,
+      "learning_rate": 9.689505699472105e-05,
+      "loss": 0.9738,
+      "mean_token_accuracy": 0.6506883263587951,
+      "num_tokens": 2235477.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.5465192794799805,
+      "epoch": 1.581868640148011,
+      "grad_norm": 1.0432610511779785,
+      "learning_rate": 9.68417737031168e-05,
+      "loss": 0.9252,
+      "mean_token_accuracy": 0.6725632548332214,
+      "num_tokens": 2248619.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.557973897457123,
+      "epoch": 1.5911193339500462,
+      "grad_norm": 1.1087877750396729,
+      "learning_rate": 9.678805200557177e-05,
+      "loss": 0.8889,
+      "mean_token_accuracy": 0.68295316696167,
+      "num_tokens": 2261406.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.5115222930908203,
+      "epoch": 1.6003700277520814,
+      "grad_norm": 1.2408086061477661,
+      "learning_rate": 9.673389240488313e-05,
+      "loss": 0.8288,
+      "mean_token_accuracy": 0.7108204662799835,
+      "num_tokens": 2274581.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.524597156047821,
+      "epoch": 1.6096207215541165,
+      "grad_norm": 1.1500499248504639,
+      "learning_rate": 9.667929540794652e-05,
+      "loss": 0.8663,
+      "mean_token_accuracy": 0.6921479344367981,
+      "num_tokens": 2288061.0,
+      "step": 1740
+    },
+    {
+      "entropy": 1.5250072598457336,
+      "epoch": 1.6188714153561516,
+      "grad_norm": 1.2787007093429565,
+      "learning_rate": 9.662426152575128e-05,
+      "loss": 0.8453,
+      "mean_token_accuracy": 0.6999235332012177,
+      "num_tokens": 2300734.0,
+      "step": 1750
+    },
+    {
+      "entropy": 1.5432110905647278,
+      "epoch": 1.6281221091581868,
+      "grad_norm": 1.3493939638137817,
+      "learning_rate": 9.656879127337571e-05,
+      "loss": 0.9228,
+      "mean_token_accuracy": 0.6721714973449707,
+      "num_tokens": 2314230.0,
+      "step": 1760
+    },
+    {
+      "entropy": 1.547983467578888,
+      "epoch": 1.637372802960222,
+      "grad_norm": 1.2188769578933716,
+      "learning_rate": 9.651288516998225e-05,
+      "loss": 0.9188,
+      "mean_token_accuracy": 0.6699378073215485,
+      "num_tokens": 2327688.0,
+      "step": 1770
+    },
+    {
+      "entropy": 1.5295732498168946,
+      "epoch": 1.646623496762257,
+      "grad_norm": 1.4545722007751465,
+      "learning_rate": 9.645654373881252e-05,
+      "loss": 0.8885,
+      "mean_token_accuracy": 0.6881523847579956,
+      "num_tokens": 2340180.0,
+      "step": 1780
+    },
+    {
+      "entropy": 1.5252346634864806,
+      "epoch": 1.6558741905642922,
+      "grad_norm": 1.209765911102295,
+      "learning_rate": 9.639976750718259e-05,
+      "loss": 0.883,
+      "mean_token_accuracy": 0.6812373876571656,
+      "num_tokens": 2352928.0,
+      "step": 1790
+    },
+    {
+      "entropy": 1.5846696734428405,
+      "epoch": 1.6651248843663273,
+      "grad_norm": 1.515471339225769,
+      "learning_rate": 9.634255700647791e-05,
+      "loss": 0.9982,
+      "mean_token_accuracy": 0.6503925740718841,
+      "num_tokens": 2365746.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.5660824060440064,
+      "epoch": 1.6743755781683625,
+      "grad_norm": 1.2912554740905762,
+      "learning_rate": 9.628491277214837e-05,
+      "loss": 0.9329,
+      "mean_token_accuracy": 0.6697620809078216,
+      "num_tokens": 2379127.0,
+      "step": 1810
+    },
+    {
+      "entropy": 1.5519426345825196,
+      "epoch": 1.6836262719703978,
+      "grad_norm": 1.1865144968032837,
+      "learning_rate": 9.622683534370332e-05,
+      "loss": 0.8455,
+      "mean_token_accuracy": 0.7048421978950501,
+      "num_tokens": 2392098.0,
+      "step": 1820
+    },
+    {
+      "entropy": 1.5589602589607239,
+      "epoch": 1.692876965772433,
+      "grad_norm": 1.545067548751831,
+      "learning_rate": 9.61683252647065e-05,
+      "loss": 0.9298,
+      "mean_token_accuracy": 0.6723642349243164,
+      "num_tokens": 2405401.0,
+      "step": 1830
+    },
+    {
+      "entropy": 1.5627625107765197,
+      "epoch": 1.702127659574468,
+      "grad_norm": 1.485780119895935,
+      "learning_rate": 9.610938308277097e-05,
+      "loss": 0.9004,
+      "mean_token_accuracy": 0.6864800393581391,
+      "num_tokens": 2418580.0,
+      "step": 1840
+    },
+    {
+      "entropy": 1.5423608303070069,
+      "epoch": 1.7113783533765032,
+      "grad_norm": 1.442898154258728,
+      "learning_rate": 9.605000934955393e-05,
+      "loss": 0.9016,
+      "mean_token_accuracy": 0.688067364692688,
+      "num_tokens": 2431540.0,
+      "step": 1850
+    },
+    {
+      "entropy": 1.5365403652191163,
+      "epoch": 1.7206290471785384,
+      "grad_norm": 1.4761542081832886,
+      "learning_rate": 9.599020462075164e-05,
+      "loss": 0.847,
+      "mean_token_accuracy": 0.7041001141071319,
+      "num_tokens": 2444522.0,
+      "step": 1860
+    },
+    {
+      "entropy": 1.549383556842804,
+      "epoch": 1.7298797409805735,
+      "grad_norm": 1.1147528886795044,
+      "learning_rate": 9.592996945609413e-05,
+      "loss": 0.9157,
+      "mean_token_accuracy": 0.6800615966320038,
+      "num_tokens": 2457909.0,
+      "step": 1870
+    },
+    {
+      "entropy": 1.5465178847312928,
+      "epoch": 1.7391304347826086,
+      "grad_norm": 1.2626795768737793,
+      "learning_rate": 9.586930441934004e-05,
+      "loss": 0.9013,
+      "mean_token_accuracy": 0.6742372810840607,
+      "num_tokens": 2471291.0,
+      "step": 1880
+    },
+    {
+      "entropy": 1.5268906474113464,
+      "epoch": 1.748381128584644,
+      "grad_norm": 1.4618085622787476,
+      "learning_rate": 9.580821007827128e-05,
+      "loss": 0.8554,
+      "mean_token_accuracy": 0.699239867925644,
+      "num_tokens": 2484157.0,
+      "step": 1890
+    },
+    {
+      "entropy": 1.524414074420929,
+      "epoch": 1.7576318223866791,
+      "grad_norm": 1.6484898328781128,
+      "learning_rate": 9.574668700468777e-05,
+      "loss": 0.8365,
+      "mean_token_accuracy": 0.7023455381393433,
+      "num_tokens": 2497096.0,
+      "step": 1900
+    },
+    {
+      "entropy": 1.552185297012329,
+      "epoch": 1.7668825161887143,
+      "grad_norm": 1.3367946147918701,
+      "learning_rate": 9.568473577440207e-05,
+      "loss": 0.9098,
+      "mean_token_accuracy": 0.6807539343833924,
+      "num_tokens": 2510471.0,
+      "step": 1910
+    },
+    {
+      "entropy": 1.5769048929214478,
+      "epoch": 1.7761332099907494,
+      "grad_norm": 1.4459549188613892,
+      "learning_rate": 9.562235696723396e-05,
+      "loss": 0.9932,
+      "mean_token_accuracy": 0.6512850284576416,
+      "num_tokens": 2523676.0,
+      "step": 1920
+    },
+    {
+      "entropy": 1.5604833364486694,
+      "epoch": 1.7853839037927846,
+      "grad_norm": 1.1662452220916748,
+      "learning_rate": 9.555955116700503e-05,
+      "loss": 0.9686,
+      "mean_token_accuracy": 0.6559585571289063,
+      "num_tokens": 2536505.0,
+      "step": 1930
+    },
+    {
+      "entropy": 1.5819882154464722,
+      "epoch": 1.7946345975948197,
+      "grad_norm": 1.775183081626892,
+      "learning_rate": 9.549631896153327e-05,
+      "loss": 0.9667,
+      "mean_token_accuracy": 0.6530949115753174,
+      "num_tokens": 2549977.0,
+      "step": 1940
+    },
+    {
+      "entropy": 1.5256071090698242,
+      "epoch": 1.8038852913968548,
+      "grad_norm": 1.5862903594970703,
+      "learning_rate": 9.543266094262748e-05,
+      "loss": 0.8469,
+      "mean_token_accuracy": 0.7041983604431152,
+      "num_tokens": 2563118.0,
+      "step": 1950
+    },
+    {
+      "entropy": 1.5496267795562744,
+      "epoch": 1.81313598519889,
+      "grad_norm": 1.4400105476379395,
+      "learning_rate": 9.536857770608178e-05,
+      "loss": 0.8863,
+      "mean_token_accuracy": 0.690715366601944,
+      "num_tokens": 2575860.0,
+      "step": 1960
+    },
+    {
+      "entropy": 1.582668626308441,
+      "epoch": 1.822386679000925,
+      "grad_norm": 1.3100849390029907,
+      "learning_rate": 9.530406985167004e-05,
+      "loss": 0.9788,
+      "mean_token_accuracy": 0.6574987709522248,
+      "num_tokens": 2589171.0,
+      "step": 1970
+    },
+    {
+      "entropy": 1.5688007354736329,
+      "epoch": 1.8316373728029602,
+      "grad_norm": 1.4323651790618896,
+      "learning_rate": 9.523913798314025e-05,
+      "loss": 0.8941,
+      "mean_token_accuracy": 0.6844177067279815,
+      "num_tokens": 2602693.0,
+      "step": 1980
+    },
+    {
+      "entropy": 1.5714410424232483,
+      "epoch": 1.8408880666049954,
+      "grad_norm": 1.203909158706665,
+      "learning_rate": 9.517378270820889e-05,
+      "loss": 0.9738,
+      "mean_token_accuracy": 0.657619196176529,
+      "num_tokens": 2615664.0,
+      "step": 1990
+    },
+    {
+      "entropy": 1.54354647397995,
+      "epoch": 1.8501387604070305,
+      "grad_norm": 1.3018907308578491,
+      "learning_rate": 9.510800463855518e-05,
+      "loss": 0.8754,
+      "mean_token_accuracy": 0.6927144229412079,
+      "num_tokens": 2628527.0,
+      "step": 2000
+    },
+    {
+      "entropy": 1.5385435581207276,
+      "epoch": 1.8593894542090657,
+      "grad_norm": 1.383748173713684,
+      "learning_rate": 9.504180438981543e-05,
+      "loss": 0.8386,
+      "mean_token_accuracy": 0.6973421156406403,
+      "num_tokens": 2641765.0,
+      "step": 2010
+    },
+    {
+      "entropy": 1.5870672106742858,
+      "epoch": 1.8686401480111008,
+      "grad_norm": 1.138781189918518,
+      "learning_rate": 9.497518258157722e-05,
+      "loss": 0.9198,
+      "mean_token_accuracy": 0.6744202077388763,
+      "num_tokens": 2655887.0,
+      "step": 2020
+    },
+    {
+      "entropy": 1.5769222855567933,
+      "epoch": 1.877890841813136,
+      "grad_norm": 1.1709132194519043,
+      "learning_rate": 9.49081398373737e-05,
+      "loss": 0.9213,
+      "mean_token_accuracy": 0.6796155214309693,
+      "num_tokens": 2669120.0,
+      "step": 2030
+    },
+    {
+      "entropy": 1.5950719594955445,
+      "epoch": 1.887141535615171,
+      "grad_norm": 1.243532419204712,
+      "learning_rate": 9.484067678467761e-05,
+      "loss": 0.9889,
+      "mean_token_accuracy": 0.6540985763072967,
+      "num_tokens": 2682431.0,
+      "step": 2040
+    },
+    {
+      "entropy": 1.5713389277458192,
+      "epoch": 1.8963922294172062,
+      "grad_norm": 1.2355759143829346,
+      "learning_rate": 9.477279405489552e-05,
+      "loss": 0.9337,
+      "mean_token_accuracy": 0.6738641381263732,
+      "num_tokens": 2695264.0,
+      "step": 2050
+    },
+    {
+      "entropy": 1.5699865460395812,
+      "epoch": 1.9056429232192413,
+      "grad_norm": 1.1436556577682495,
+      "learning_rate": 9.470449228336184e-05,
+      "loss": 0.9077,
+      "mean_token_accuracy": 0.6798655807971954,
+      "num_tokens": 2709261.0,
+      "step": 2060
+    },
+    {
+      "entropy": 1.5231060266494751,
+      "epoch": 1.9148936170212765,
+      "grad_norm": 1.6024360656738281,
+      "learning_rate": 9.463577210933299e-05,
+      "loss": 0.9189,
+      "mean_token_accuracy": 0.6639018952846527,
+      "num_tokens": 2722051.0,
+      "step": 2070
+    },
+    {
+      "entropy": 1.5486979722976684,
+      "epoch": 1.9241443108233116,
+      "grad_norm": 1.1532946825027466,
+      "learning_rate": 9.456663417598126e-05,
+      "loss": 0.9257,
+      "mean_token_accuracy": 0.6788547515869141,
+      "num_tokens": 2735469.0,
+      "step": 2080
+    },
+    {
+      "entropy": 1.5938722610473632,
+      "epoch": 1.9333950046253467,
+      "grad_norm": 1.583771824836731,
+      "learning_rate": 9.449707913038893e-05,
+      "loss": 1.0085,
+      "mean_token_accuracy": 0.6393784880638123,
+      "num_tokens": 2748097.0,
+      "step": 2090
+    },
+    {
+      "entropy": 1.5397228717803955,
+      "epoch": 1.942645698427382,
+      "grad_norm": 1.5729622840881348,
+      "learning_rate": 9.44271076235421e-05,
+      "loss": 0.8944,
+      "mean_token_accuracy": 0.6839666903018952,
+      "num_tokens": 2761360.0,
+      "step": 2100
+    },
+    {
+      "entropy": 1.5583428025245667,
+      "epoch": 1.9518963922294172,
+      "grad_norm": 1.355100154876709,
+      "learning_rate": 9.435672031032474e-05,
+      "loss": 0.9482,
+      "mean_token_accuracy": 0.6667573392391205,
+      "num_tokens": 2775017.0,
+      "step": 2110
+    },
+    {
+      "entropy": 1.5730461478233337,
+      "epoch": 1.9611470860314524,
+      "grad_norm": 1.2492750883102417,
+      "learning_rate": 9.428591784951237e-05,
+      "loss": 0.9278,
+      "mean_token_accuracy": 0.6671026587486267,
+      "num_tokens": 2788381.0,
+      "step": 2120
+    },
+    {
+      "entropy": 1.5341883182525635,
+      "epoch": 1.9703977798334875,
+      "grad_norm": 1.1517435312271118,
+      "learning_rate": 9.421470090376609e-05,
+      "loss": 0.8996,
+      "mean_token_accuracy": 0.6873630166053772,
+      "num_tokens": 2801167.0,
+      "step": 2130
+    },
+    {
+      "entropy": 1.5374996066093445,
+      "epoch": 1.9796484736355227,
+      "grad_norm": 1.4243314266204834,
+      "learning_rate": 9.414307013962623e-05,
+      "loss": 0.8341,
+      "mean_token_accuracy": 0.7012212157249451,
+      "num_tokens": 2814560.0,
+      "step": 2140
+    },
+    {
+      "entropy": 1.5451178312301637,
+      "epoch": 1.9888991674375578,
+      "grad_norm": 1.2103217840194702,
+      "learning_rate": 9.407102622750617e-05,
+      "loss": 0.893,
+      "mean_token_accuracy": 0.6850167155265808,
+      "num_tokens": 2827299.0,
+      "step": 2150
+    },
+    {
+      "entropy": 1.5628185272216797,
+      "epoch": 1.998149861239593,
+      "grad_norm": 1.5071104764938354,
+      "learning_rate": 9.399856984168612e-05,
+      "loss": 0.9713,
+      "mean_token_accuracy": 0.6638465642929077,
+      "num_tokens": 2839715.0,
+      "step": 2160
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10810,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2088746235999642e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70d35cdda86487b34a22e099225d60cf4a9bcebf0faa180a692b90c154874ffa
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-2162/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd1feb35537a2b0ec9e96bcb35395b23751011db9b5e89502ec9403163164a3
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-3243/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896