agu18dec committed on 28 days ago

Commit

1d6ec55

verified ·

1 Parent(s): e77c492

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/README.md +61 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/trainer_state.json +1774 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/trainer_state.json +2644 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/trainer_state.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/tokenizer.json +3 -0

.gitattributes CHANGED Viewed

@@ -600,3 +600,14 @@ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noS
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-8144/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-9162/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-8144/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-9162/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-5232/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-6104/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-6976/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-7848/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-872/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-8720/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
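+generator = pipeline("text-generation", model="checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys", device="cuda")  # path to this adapter directory; requires `peft` to be installed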
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/7w4qne7u)
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.19.1
+- TRL: 0.28.0
+- Transformers: 4.57.6
+- Pytorch: 2.9.1
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4497725543943b3ffc62917072d81017d4a0be55b8f2c8ef0ebfd55b9aeb2831
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT: 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4825512ba858d36680597e1a9abb6e2a74e6725df30d73a3b77a5f4d369ebe7f
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1774 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 1744,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2426587224006653,
+      "epoch": 0.011467889908256881,
+      "grad_norm": 3.028918981552124,
+      "learning_rate": 2.0642201834862385e-06,
+      "loss": 0.6546,
+      "mean_token_accuracy": 0.7775660812854767,
+      "num_tokens": 14294.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2369988679885864,
+      "epoch": 0.022935779816513763,
+      "grad_norm": 2.726299524307251,
+      "learning_rate": 4.357798165137615e-06,
+      "loss": 0.6567,
+      "mean_token_accuracy": 0.7740493595600129,
+      "num_tokens": 28037.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.2392263770103455,
+      "epoch": 0.034403669724770644,
+      "grad_norm": 2.2062387466430664,
+      "learning_rate": 6.651376146788992e-06,
+      "loss": 0.6411,
+      "mean_token_accuracy": 0.7753027558326722,
+      "num_tokens": 42268.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2435187816619873,
+      "epoch": 0.045871559633027525,
+      "grad_norm": 2.4188315868377686,
+      "learning_rate": 8.944954128440369e-06,
+      "loss": 0.6271,
+      "mean_token_accuracy": 0.7827982664108276,
+      "num_tokens": 56474.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.261301326751709,
+      "epoch": 0.05733944954128441,
+      "grad_norm": 2.490365982055664,
+      "learning_rate": 1.1238532110091744e-05,
+      "loss": 0.6343,
+      "mean_token_accuracy": 0.7792274355888367,
+      "num_tokens": 70348.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2830254554748535,
+      "epoch": 0.06880733944954129,
+      "grad_norm": 2.6820173263549805,
+      "learning_rate": 1.3532110091743119e-05,
+      "loss": 0.6298,
+      "mean_token_accuracy": 0.7811978995800019,
+      "num_tokens": 84857.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.275150680541992,
+      "epoch": 0.08027522935779817,
+      "grad_norm": 1.8796888589859009,
+      "learning_rate": 1.5825688073394497e-05,
+      "loss": 0.6689,
+      "mean_token_accuracy": 0.7587406218051911,
+      "num_tokens": 99396.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.2276832342147828,
+      "epoch": 0.09174311926605505,
+      "grad_norm": 1.690596580505371,
+      "learning_rate": 1.811926605504587e-05,
+      "loss": 0.5586,
+      "mean_token_accuracy": 0.8031339049339294,
+      "num_tokens": 112619.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.2304488182067872,
+      "epoch": 0.10321100917431193,
+      "grad_norm": 1.7719478607177734,
+      "learning_rate": 2.0412844036697248e-05,
+      "loss": 0.6093,
+      "mean_token_accuracy": 0.7849110841751099,
+      "num_tokens": 126496.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.2812824487686156,
+      "epoch": 0.11467889908256881,
+      "grad_norm": 1.5947109460830688,
+      "learning_rate": 2.2706422018348624e-05,
+      "loss": 0.6455,
+      "mean_token_accuracy": 0.7670063555240632,
+      "num_tokens": 140628.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.2834580063819885,
+      "epoch": 0.12614678899082568,
+      "grad_norm": 1.8809813261032104,
+      "learning_rate": 2.5e-05,
+      "loss": 0.6489,
+      "mean_token_accuracy": 0.7716078400611878,
+      "num_tokens": 153959.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.2611944794654846,
+      "epoch": 0.13761467889908258,
+      "grad_norm": 1.780765414237976,
+      "learning_rate": 2.7293577981651375e-05,
+      "loss": 0.5445,
+      "mean_token_accuracy": 0.8067175269126892,
+      "num_tokens": 168207.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.272983717918396,
+      "epoch": 0.14908256880733944,
+      "grad_norm": 2.119795560836792,
+      "learning_rate": 2.9587155963302755e-05,
+      "loss": 0.6021,
+      "mean_token_accuracy": 0.7827390134334564,
+      "num_tokens": 182042.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.2775203227996825,
+      "epoch": 0.16055045871559634,
+      "grad_norm": 1.4455509185791016,
+      "learning_rate": 3.188073394495413e-05,
+      "loss": 0.5715,
+      "mean_token_accuracy": 0.7992757976055145,
+      "num_tokens": 196015.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.2987091898918153,
+      "epoch": 0.1720183486238532,
+      "grad_norm": 1.4980850219726562,
+      "learning_rate": 3.4174311926605505e-05,
+      "loss": 0.6023,
+      "mean_token_accuracy": 0.7867661654949188,
+      "num_tokens": 210215.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.2755884647369384,
+      "epoch": 0.1834862385321101,
+      "grad_norm": 1.8361093997955322,
+      "learning_rate": 3.646788990825688e-05,
+      "loss": 0.6042,
+      "mean_token_accuracy": 0.7830365121364593,
+      "num_tokens": 224017.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.2698514223098756,
+      "epoch": 0.19495412844036697,
+      "grad_norm": 1.3812596797943115,
+      "learning_rate": 3.876146788990826e-05,
+      "loss": 0.5846,
+      "mean_token_accuracy": 0.7849388122558594,
+      "num_tokens": 237482.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.244412088394165,
+      "epoch": 0.20642201834862386,
+      "grad_norm": 1.798601746559143,
+      "learning_rate": 4.1055045871559636e-05,
+      "loss": 0.547,
+      "mean_token_accuracy": 0.8001754701137542,
+      "num_tokens": 251169.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.2745447516441346,
+      "epoch": 0.21788990825688073,
+      "grad_norm": 1.9844188690185547,
+      "learning_rate": 4.334862385321101e-05,
+      "loss": 0.5623,
+      "mean_token_accuracy": 0.7987869799137115,
+      "num_tokens": 264764.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.3000339031219483,
+      "epoch": 0.22935779816513763,
+      "grad_norm": 1.4861323833465576,
+      "learning_rate": 4.564220183486239e-05,
+      "loss": 0.6123,
+      "mean_token_accuracy": 0.7852487206459046,
+      "num_tokens": 278504.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.3115605235099792,
+      "epoch": 0.2408256880733945,
+      "grad_norm": 1.8251842260360718,
+      "learning_rate": 4.7935779816513766e-05,
+      "loss": 0.658,
+      "mean_token_accuracy": 0.759744155406952,
+      "num_tokens": 293021.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.3224670886993408,
+      "epoch": 0.25229357798165136,
+      "grad_norm": 1.5895862579345703,
+      "learning_rate": 5.022935779816514e-05,
+      "loss": 0.6213,
+      "mean_token_accuracy": 0.7748652398586273,
+      "num_tokens": 306811.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.2993175029754638,
+      "epoch": 0.26376146788990823,
+      "grad_norm": 1.6444000005722046,
+      "learning_rate": 5.252293577981652e-05,
+      "loss": 0.5723,
+      "mean_token_accuracy": 0.7920855104923248,
+      "num_tokens": 320873.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.3019076824188232,
+      "epoch": 0.27522935779816515,
+      "grad_norm": 1.5319479703903198,
+      "learning_rate": 5.481651376146789e-05,
+      "loss": 0.5945,
+      "mean_token_accuracy": 0.7845574736595153,
+      "num_tokens": 335323.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.289823544025421,
+      "epoch": 0.286697247706422,
+      "grad_norm": 1.2900819778442383,
+      "learning_rate": 5.7110091743119266e-05,
+      "loss": 0.5692,
+      "mean_token_accuracy": 0.7950143396854401,
+      "num_tokens": 349575.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.2885443449020386,
+      "epoch": 0.2981651376146789,
+      "grad_norm": 1.408677577972412,
+      "learning_rate": 5.940366972477065e-05,
+      "loss": 0.6104,
+      "mean_token_accuracy": 0.7816850125789643,
+      "num_tokens": 363961.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.3065945267677308,
+      "epoch": 0.30963302752293576,
+      "grad_norm": 1.3809661865234375,
+      "learning_rate": 6.169724770642203e-05,
+      "loss": 0.6426,
+      "mean_token_accuracy": 0.7661891877651215,
+      "num_tokens": 377555.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.2889371395111084,
+      "epoch": 0.3211009174311927,
+      "grad_norm": 1.4974966049194336,
+      "learning_rate": 6.39908256880734e-05,
+      "loss": 0.5882,
+      "mean_token_accuracy": 0.7819968700408936,
+      "num_tokens": 391423.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.2952162742614746,
+      "epoch": 0.33256880733944955,
+      "grad_norm": 1.3621913194656372,
+      "learning_rate": 6.628440366972477e-05,
+      "loss": 0.57,
+      "mean_token_accuracy": 0.7946897804737091,
+      "num_tokens": 405650.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.2888988494873046,
+      "epoch": 0.3440366972477064,
+      "grad_norm": 1.793961524963379,
+      "learning_rate": 6.857798165137616e-05,
+      "loss": 0.6273,
+      "mean_token_accuracy": 0.7732390701770783,
+      "num_tokens": 419332.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.289334809780121,
+      "epoch": 0.3555045871559633,
+      "grad_norm": 1.5518903732299805,
+      "learning_rate": 7.087155963302753e-05,
+      "loss": 0.6492,
+      "mean_token_accuracy": 0.757664144039154,
+      "num_tokens": 433432.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.2899688124656676,
+      "epoch": 0.3669724770642202,
+      "grad_norm": 1.5826157331466675,
+      "learning_rate": 7.31651376146789e-05,
+      "loss": 0.5805,
+      "mean_token_accuracy": 0.7921296834945679,
+      "num_tokens": 447592.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.2863509058952332,
+      "epoch": 0.37844036697247707,
+      "grad_norm": 1.7210900783538818,
+      "learning_rate": 7.545871559633027e-05,
+      "loss": 0.5926,
+      "mean_token_accuracy": 0.7852405548095703,
+      "num_tokens": 462489.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.2890722513198853,
+      "epoch": 0.38990825688073394,
+      "grad_norm": 1.6051267385482788,
+      "learning_rate": 7.775229357798165e-05,
+      "loss": 0.6173,
+      "mean_token_accuracy": 0.7741429924964904,
+      "num_tokens": 476591.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.313070333003998,
+      "epoch": 0.4013761467889908,
+      "grad_norm": 1.7080140113830566,
+      "learning_rate": 8.004587155963303e-05,
+      "loss": 0.6165,
+      "mean_token_accuracy": 0.7842044055461883,
+      "num_tokens": 491338.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.2972561955451964,
+      "epoch": 0.41284403669724773,
+      "grad_norm": 1.7454527616500854,
+      "learning_rate": 8.23394495412844e-05,
+      "loss": 0.5927,
+      "mean_token_accuracy": 0.7841840922832489,
+      "num_tokens": 505152.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.2944241881370544,
+      "epoch": 0.4243119266055046,
+      "grad_norm": 1.8223613500595093,
+      "learning_rate": 8.463302752293578e-05,
+      "loss": 0.5862,
+      "mean_token_accuracy": 0.7846642255783081,
+      "num_tokens": 519536.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.2918418169021606,
+      "epoch": 0.43577981651376146,
+      "grad_norm": 1.323716640472412,
+      "learning_rate": 8.692660550458716e-05,
+      "loss": 0.5761,
+      "mean_token_accuracy": 0.788896131515503,
+      "num_tokens": 533610.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.3106001019477844,
+      "epoch": 0.44724770642201833,
+      "grad_norm": 2.1389827728271484,
+      "learning_rate": 8.922018348623854e-05,
+      "loss": 0.6442,
+      "mean_token_accuracy": 0.7677759766578675,
+      "num_tokens": 547213.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.2924273014068604,
+      "epoch": 0.45871559633027525,
+      "grad_norm": 1.3077127933502197,
+      "learning_rate": 9.151376146788991e-05,
+      "loss": 0.6044,
+      "mean_token_accuracy": 0.7855095267295837,
+      "num_tokens": 560707.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.3057442545890807,
+      "epoch": 0.4701834862385321,
+      "grad_norm": 1.658679723739624,
+      "learning_rate": 9.380733944954129e-05,
+      "loss": 0.5803,
+      "mean_token_accuracy": 0.7926251292228699,
+      "num_tokens": 574533.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.3044120788574218,
+      "epoch": 0.481651376146789,
+      "grad_norm": 1.7965151071548462,
+      "learning_rate": 9.610091743119267e-05,
+      "loss": 0.5984,
+      "mean_token_accuracy": 0.7874112606048584,
+      "num_tokens": 587931.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.3121570587158202,
+      "epoch": 0.49311926605504586,
+      "grad_norm": 1.1833796501159668,
+      "learning_rate": 9.839449541284404e-05,
+      "loss": 0.6231,
+      "mean_token_accuracy": 0.7761680126190186,
+      "num_tokens": 602080.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.3229384422302246,
+      "epoch": 0.5045871559633027,
+      "grad_norm": 1.98506760597229,
+      "learning_rate": 9.99999676404826e-05,
+      "loss": 0.6223,
+      "mean_token_accuracy": 0.774652361869812,
+      "num_tokens": 615535.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.2842121720314026,
+      "epoch": 0.5160550458715596,
+      "grad_norm": 1.8412768840789795,
+      "learning_rate": 9.999939236133826e-05,
+      "loss": 0.5968,
+      "mean_token_accuracy": 0.7840604305267334,
+      "num_tokens": 628767.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.3064908266067505,
+      "epoch": 0.5275229357798165,
+      "grad_norm": 1.7538436651229858,
+      "learning_rate": 9.999809799133033e-05,
+      "loss": 0.6244,
+      "mean_token_accuracy": 0.7701604008674622,
+      "num_tokens": 642874.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.3011385202407837,
+      "epoch": 0.5389908256880734,
+      "grad_norm": 2.0401413440704346,
+      "learning_rate": 9.99960845490744e-05,
+      "loss": 0.5897,
+      "mean_token_accuracy": 0.7876223146915435,
+      "num_tokens": 656374.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.3175038933753966,
+      "epoch": 0.5504587155963303,
+      "grad_norm": 1.5815656185150146,
+      "learning_rate": 9.999335206352783e-05,
+      "loss": 0.6681,
+      "mean_token_accuracy": 0.7586038947105408,
+      "num_tokens": 670397.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.3054586052894592,
+      "epoch": 0.5619266055045872,
+      "grad_norm": 1.7010897397994995,
+      "learning_rate": 9.998990057398916e-05,
+      "loss": 0.6488,
+      "mean_token_accuracy": 0.7646380603313446,
+      "num_tokens": 684143.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.2969472885131836,
+      "epoch": 0.573394495412844,
+      "grad_norm": 2.1294353008270264,
+      "learning_rate": 9.998573013009771e-05,
+      "loss": 0.6505,
+      "mean_token_accuracy": 0.7664439141750335,
+      "num_tokens": 697427.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.3074483752250672,
+      "epoch": 0.5848623853211009,
+      "grad_norm": 2.1683812141418457,
+      "learning_rate": 9.998084079183276e-05,
+      "loss": 0.5897,
+      "mean_token_accuracy": 0.7885696291923523,
+      "num_tokens": 711947.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.2956400752067565,
+      "epoch": 0.5963302752293578,
+      "grad_norm": 1.4167346954345703,
+      "learning_rate": 9.997523262951274e-05,
+      "loss": 0.6388,
+      "mean_token_accuracy": 0.7672183573246002,
+      "num_tokens": 726268.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.315368902683258,
+      "epoch": 0.6077981651376146,
+      "grad_norm": 2.1706671714782715,
+      "learning_rate": 9.996890572379418e-05,
+      "loss": 0.6844,
+      "mean_token_accuracy": 0.7582804381847381,
+      "num_tokens": 740230.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.2926068663597108,
+      "epoch": 0.6192660550458715,
+      "grad_norm": 1.6460140943527222,
+      "learning_rate": 9.99618601656706e-05,
+      "loss": 0.5693,
+      "mean_token_accuracy": 0.795549190044403,
+      "num_tokens": 754570.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.2848342299461364,
+      "epoch": 0.6307339449541285,
+      "grad_norm": 1.7705565690994263,
+      "learning_rate": 9.995409605647117e-05,
+      "loss": 0.6189,
+      "mean_token_accuracy": 0.7828136622905731,
+      "num_tokens": 768740.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.3091715574264526,
+      "epoch": 0.6422018348623854,
+      "grad_norm": 1.7903367280960083,
+      "learning_rate": 9.994561350785923e-05,
+      "loss": 0.6096,
+      "mean_token_accuracy": 0.7809465050697326,
+      "num_tokens": 782860.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.3097781181335448,
+      "epoch": 0.6536697247706422,
+      "grad_norm": 1.6261135339736938,
+      "learning_rate": 9.993641264183074e-05,
+      "loss": 0.6488,
+      "mean_token_accuracy": 0.7686248242855072,
+      "num_tokens": 796852.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.2892103433609008,
+      "epoch": 0.6651376146788991,
+      "grad_norm": 1.530013084411621,
+      "learning_rate": 9.992649359071247e-05,
+      "loss": 0.6099,
+      "mean_token_accuracy": 0.7832099735736847,
+      "num_tokens": 810833.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.2781771540641784,
+      "epoch": 0.676605504587156,
+      "grad_norm": 1.3513305187225342,
+      "learning_rate": 9.991585649716014e-05,
+      "loss": 0.6059,
+      "mean_token_accuracy": 0.7849724233150482,
+      "num_tokens": 825129.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.289398467540741,
+      "epoch": 0.6880733944954128,
+      "grad_norm": 1.2714006900787354,
+      "learning_rate": 9.990450151415636e-05,
+      "loss": 0.6262,
+      "mean_token_accuracy": 0.7734242856502533,
+      "num_tokens": 839084.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.3282314896583558,
+      "epoch": 0.6995412844036697,
+      "grad_norm": 1.6062265634536743,
+      "learning_rate": 9.989242880500837e-05,
+      "loss": 0.6804,
+      "mean_token_accuracy": 0.7598551273345947,
+      "num_tokens": 853275.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.279460871219635,
+      "epoch": 0.7110091743119266,
+      "grad_norm": 1.211531400680542,
+      "learning_rate": 9.987963854334581e-05,
+      "loss": 0.5422,
+      "mean_token_accuracy": 0.8087258577346802,
+      "num_tokens": 867001.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.3079694390296936,
+      "epoch": 0.7224770642201835,
+      "grad_norm": 1.9886008501052856,
+      "learning_rate": 9.986613091311811e-05,
+      "loss": 0.6505,
+      "mean_token_accuracy": 0.7643534898757934,
+      "num_tokens": 880836.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.3083110094070434,
+      "epoch": 0.7339449541284404,
+      "grad_norm": 1.7378991842269897,
+      "learning_rate": 9.98519061085919e-05,
+      "loss": 0.6507,
+      "mean_token_accuracy": 0.7652741134166717,
+      "num_tokens": 894456.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.3111968874931335,
+      "epoch": 0.7454128440366973,
+      "grad_norm": 1.6157206296920776,
+      "learning_rate": 9.983696433434821e-05,
+      "loss": 0.6009,
+      "mean_token_accuracy": 0.7828308165073394,
+      "num_tokens": 908581.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.3001808285713197,
+      "epoch": 0.7568807339449541,
+      "grad_norm": 1.7530412673950195,
+      "learning_rate": 9.982130580527951e-05,
+      "loss": 0.5973,
+      "mean_token_accuracy": 0.7872715950012207,
+      "num_tokens": 922198.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.3001506924629211,
+      "epoch": 0.768348623853211,
+      "grad_norm": 1.8743090629577637,
+      "learning_rate": 9.980493074658665e-05,
+      "loss": 0.5991,
+      "mean_token_accuracy": 0.7848590850830078,
+      "num_tokens": 934965.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.3329032421112061,
+      "epoch": 0.7798165137614679,
+      "grad_norm": 1.646851658821106,
+      "learning_rate": 9.978783939377558e-05,
+      "loss": 0.646,
+      "mean_token_accuracy": 0.76202232837677,
+      "num_tokens": 949474.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.3042344450950623,
+      "epoch": 0.7912844036697247,
+      "grad_norm": 1.6828117370605469,
+      "learning_rate": 9.9770031992654e-05,
+      "loss": 0.5663,
+      "mean_token_accuracy": 0.7932763636112213,
+      "num_tokens": 963414.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.3154001832008362,
+      "epoch": 0.8027522935779816,
+      "grad_norm": 1.8354583978652954,
+      "learning_rate": 9.975150879932784e-05,
+      "loss": 0.5994,
+      "mean_token_accuracy": 0.7792726159095764,
+      "num_tokens": 977203.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.307938539981842,
+      "epoch": 0.8142201834862385,
+      "grad_norm": 1.6509039402008057,
+      "learning_rate": 9.97322700801975e-05,
+      "loss": 0.5663,
+      "mean_token_accuracy": 0.7955432832241058,
+      "num_tokens": 990943.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.3173952937126159,
+      "epoch": 0.8256880733944955,
+      "grad_norm": 1.8522167205810547,
+      "learning_rate": 9.971231611195407e-05,
+      "loss": 0.614,
+      "mean_token_accuracy": 0.7815097570419312,
+      "num_tokens": 1005001.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.340037202835083,
+      "epoch": 0.8371559633027523,
+      "grad_norm": 1.4919304847717285,
+      "learning_rate": 9.969164718157538e-05,
+      "loss": 0.6348,
+      "mean_token_accuracy": 0.7702794313430786,
+      "num_tokens": 1018544.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.3305164098739624,
+      "epoch": 0.8486238532110092,
+      "grad_norm": 1.5445469617843628,
+      "learning_rate": 9.967026358632184e-05,
+      "loss": 0.6136,
+      "mean_token_accuracy": 0.77325798869133,
+      "num_tokens": 1032665.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.3210863590240478,
+      "epoch": 0.8600917431192661,
+      "grad_norm": 1.9453340768814087,
+      "learning_rate": 9.964816563373212e-05,
+      "loss": 0.6514,
+      "mean_token_accuracy": 0.7692999839782715,
+      "num_tokens": 1047328.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.327096664905548,
+      "epoch": 0.8715596330275229,
+      "grad_norm": 1.8478624820709229,
+      "learning_rate": 9.962535364161879e-05,
+      "loss": 0.6003,
+      "mean_token_accuracy": 0.7799559772014618,
+      "num_tokens": 1061305.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.3272370457649232,
+      "epoch": 0.8830275229357798,
+      "grad_norm": 1.9946807622909546,
+      "learning_rate": 9.960182793806377e-05,
+      "loss": 0.6315,
+      "mean_token_accuracy": 0.7699635088443756,
+      "num_tokens": 1075123.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.3235833764076232,
+      "epoch": 0.8944954128440367,
+      "grad_norm": 1.500209927558899,
+      "learning_rate": 9.957758886141351e-05,
+      "loss": 0.6527,
+      "mean_token_accuracy": 0.7683537185192109,
+      "num_tokens": 1089084.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.312354290485382,
+      "epoch": 0.9059633027522935,
+      "grad_norm": 1.6548733711242676,
+      "learning_rate": 9.955263676027427e-05,
+      "loss": 0.5927,
+      "mean_token_accuracy": 0.7949600100517273,
+      "num_tokens": 1103963.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.3421159029006957,
+      "epoch": 0.9174311926605505,
+      "grad_norm": 1.5262596607208252,
+      "learning_rate": 9.95269719935069e-05,
+      "loss": 0.6553,
+      "mean_token_accuracy": 0.7679201364517212,
+      "num_tokens": 1117901.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.344819176197052,
+      "epoch": 0.9288990825688074,
+      "grad_norm": 1.42953360080719,
+      "learning_rate": 9.950059493022193e-05,
+      "loss": 0.6607,
+      "mean_token_accuracy": 0.762078708410263,
+      "num_tokens": 1132174.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.3429975152015685,
+      "epoch": 0.9403669724770642,
+      "grad_norm": 1.648417592048645,
+      "learning_rate": 9.947350594977402e-05,
+      "loss": 0.6929,
+      "mean_token_accuracy": 0.7437104344367981,
+      "num_tokens": 1146769.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.3269536972045899,
+      "epoch": 0.9518348623853211,
+      "grad_norm": 1.802235722541809,
+      "learning_rate": 9.944570544175673e-05,
+      "loss": 0.6676,
+      "mean_token_accuracy": 0.7601192831993103,
+      "num_tokens": 1161091.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.3191216468811036,
+      "epoch": 0.963302752293578,
+      "grad_norm": 1.9612555503845215,
+      "learning_rate": 9.941719380599672e-05,
+      "loss": 0.625,
+      "mean_token_accuracy": 0.7729354560375213,
+      "num_tokens": 1173905.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.3115869045257569,
+      "epoch": 0.9747706422018348,
+      "grad_norm": 1.2845028638839722,
+      "learning_rate": 9.93879714525481e-05,
+      "loss": 0.5944,
+      "mean_token_accuracy": 0.7839926242828369,
+      "num_tokens": 1188063.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.3091205954551697,
+      "epoch": 0.9862385321100917,
+      "grad_norm": 1.8383289575576782,
+      "learning_rate": 9.935803880168652e-05,
+      "loss": 0.6237,
+      "mean_token_accuracy": 0.7753754138946534,
+      "num_tokens": 1202695.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.2994250178337097,
+      "epoch": 0.9977064220183486,
+      "grad_norm": 1.571912407875061,
+      "learning_rate": 9.932739628390316e-05,
+      "loss": 0.6456,
+      "mean_token_accuracy": 0.7671150684356689,
+      "num_tokens": 1216684.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.3076510548591613,
+      "epoch": 1.0091743119266054,
+      "grad_norm": 1.8406661748886108,
+      "learning_rate": 9.929604433989843e-05,
+      "loss": 0.6445,
+      "mean_token_accuracy": 0.7758039116859436,
+      "num_tokens": 1229248.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.2624098420143128,
+      "epoch": 1.0206422018348624,
+      "grad_norm": 1.9808402061462402,
+      "learning_rate": 9.926398342057577e-05,
+      "loss": 0.492,
+      "mean_token_accuracy": 0.8236800074577332,
+      "num_tokens": 1243088.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.252714467048645,
+      "epoch": 1.0321100917431192,
+      "grad_norm": 2.2568917274475098,
+      "learning_rate": 9.923121398703504e-05,
+      "loss": 0.4861,
+      "mean_token_accuracy": 0.8282331109046936,
+      "num_tokens": 1256681.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.2762907862663269,
+      "epoch": 1.0435779816513762,
+      "grad_norm": 1.7591499090194702,
+      "learning_rate": 9.9197736510566e-05,
+      "loss": 0.5326,
+      "mean_token_accuracy": 0.8061232268810272,
+      "num_tokens": 1270563.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.2779451608657837,
+      "epoch": 1.0550458715596331,
+      "grad_norm": 1.7618857622146606,
+      "learning_rate": 9.916355147264142e-05,
+      "loss": 0.5762,
+      "mean_token_accuracy": 0.7888909459114075,
+      "num_tokens": 1284789.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.3000144004821776,
+      "epoch": 1.06651376146789,
+      "grad_norm": 1.929226040840149,
+      "learning_rate": 9.912865936491026e-05,
+      "loss": 0.556,
+      "mean_token_accuracy": 0.7985962986946106,
+      "num_tokens": 1298314.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.2920597314834594,
+      "epoch": 1.0779816513761469,
+      "grad_norm": 2.1356875896453857,
+      "learning_rate": 9.909306068919055e-05,
+      "loss": 0.5872,
+      "mean_token_accuracy": 0.7914662003517151,
+      "num_tokens": 1312524.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.3042231440544128,
+      "epoch": 1.0894495412844036,
+      "grad_norm": 2.148797035217285,
+      "learning_rate": 9.905675595746215e-05,
+      "loss": 0.5507,
+      "mean_token_accuracy": 0.802655827999115,
+      "num_tokens": 1326952.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.277776312828064,
+      "epoch": 1.1009174311926606,
+      "grad_norm": 1.6280494928359985,
+      "learning_rate": 9.901974569185941e-05,
+      "loss": 0.5579,
+      "mean_token_accuracy": 0.8001268386840821,
+      "num_tokens": 1341302.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.2962275981903075,
+      "epoch": 1.1123853211009174,
+      "grad_norm": 1.8065513372421265,
+      "learning_rate": 9.898203042466368e-05,
+      "loss": 0.5492,
+      "mean_token_accuracy": 0.8058996260166168,
+      "num_tokens": 1355689.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.2893213629722595,
+      "epoch": 1.1238532110091743,
+      "grad_norm": 1.864761233329773,
+      "learning_rate": 9.894361069829565e-05,
+      "loss": 0.5292,
+      "mean_token_accuracy": 0.8077204465866089,
+      "num_tokens": 1369850.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.2918407797813416,
+      "epoch": 1.135321100917431,
+      "grad_norm": 2.276775598526001,
+      "learning_rate": 9.89044870653075e-05,
+      "loss": 0.564,
+      "mean_token_accuracy": 0.7952383041381836,
+      "num_tokens": 1384054.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.281248104572296,
+      "epoch": 1.146788990825688,
+      "grad_norm": 2.1157305240631104,
+      "learning_rate": 9.886466008837503e-05,
+      "loss": 0.5706,
+      "mean_token_accuracy": 0.7949798464775085,
+      "num_tokens": 1398492.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.2710728526115418,
+      "epoch": 1.158256880733945,
+      "grad_norm": 1.8817031383514404,
+      "learning_rate": 9.882413034028948e-05,
+      "loss": 0.516,
+      "mean_token_accuracy": 0.8137441635131836,
+      "num_tokens": 1412100.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.2870657205581666,
+      "epoch": 1.1697247706422018,
+      "grad_norm": 1.7975279092788696,
+      "learning_rate": 9.878289840394938e-05,
+      "loss": 0.5374,
+      "mean_token_accuracy": 0.8032542705535889,
+      "num_tokens": 1425770.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.2666459918022155,
+      "epoch": 1.1811926605504588,
+      "grad_norm": 2.47218656539917,
+      "learning_rate": 9.874096487235212e-05,
+      "loss": 0.5158,
+      "mean_token_accuracy": 0.8173266768455505,
+      "num_tokens": 1439309.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.3137032628059386,
+      "epoch": 1.1926605504587156,
+      "grad_norm": 1.7813074588775635,
+      "learning_rate": 9.869833034858538e-05,
+      "loss": 0.5324,
+      "mean_token_accuracy": 0.8099446773529053,
+      "num_tokens": 1454541.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.2864318251609803,
+      "epoch": 1.2041284403669725,
+      "grad_norm": 1.9276366233825684,
+      "learning_rate": 9.86549954458186e-05,
+      "loss": 0.5554,
+      "mean_token_accuracy": 0.8048118472099304,
+      "num_tokens": 1468346.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.2949382424354554,
+      "epoch": 1.2155963302752293,
+      "grad_norm": 1.9171100854873657,
+      "learning_rate": 9.861096078729396e-05,
+      "loss": 0.5857,
+      "mean_token_accuracy": 0.7923648238182068,
+      "num_tokens": 1482839.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.2825786828994752,
+      "epoch": 1.2270642201834863,
+      "grad_norm": 1.458295226097107,
+      "learning_rate": 9.85662270063176e-05,
+      "loss": 0.5344,
+      "mean_token_accuracy": 0.8081244885921478,
+      "num_tokens": 1496532.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.2934918642044066,
+      "epoch": 1.238532110091743,
+      "grad_norm": 2.2048583030700684,
+      "learning_rate": 9.852079474625035e-05,
+      "loss": 0.5802,
+      "mean_token_accuracy": 0.7943230092525482,
+      "num_tokens": 1510406.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.3103590607643127,
+      "epoch": 1.25,
+      "grad_norm": 2.103316307067871,
+      "learning_rate": 9.847466466049868e-05,
+      "loss": 0.5761,
+      "mean_token_accuracy": 0.7919000566005707,
+      "num_tokens": 1524582.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.2943686366081237,
+      "epoch": 1.261467889908257,
+      "grad_norm": 1.8935585021972656,
+      "learning_rate": 9.84278374125051e-05,
+      "loss": 0.5668,
+      "mean_token_accuracy": 0.795119684934616,
+      "num_tokens": 1538645.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.2833523988723754,
+      "epoch": 1.2729357798165137,
+      "grad_norm": 1.5310587882995605,
+      "learning_rate": 9.838031367573868e-05,
+      "loss": 0.4791,
+      "mean_token_accuracy": 0.8290136575698852,
+      "num_tokens": 1552198.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.2810697436332703,
+      "epoch": 1.2844036697247707,
+      "grad_norm": 1.9493242502212524,
+      "learning_rate": 9.833209413368546e-05,
+      "loss": 0.5479,
+      "mean_token_accuracy": 0.7984305679798126,
+      "num_tokens": 1566248.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.2971422672271729,
+      "epoch": 1.2958715596330275,
+      "grad_norm": 2.143052816390991,
+      "learning_rate": 9.828317947983851e-05,
+      "loss": 0.5556,
+      "mean_token_accuracy": 0.7962001860141754,
+      "num_tokens": 1579657.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.2938915967941285,
+      "epoch": 1.3073394495412844,
+      "grad_norm": 3.074519395828247,
+      "learning_rate": 9.823357041768797e-05,
+      "loss": 0.5808,
+      "mean_token_accuracy": 0.7921633243560791,
+      "num_tokens": 1594362.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.3013799428939818,
+      "epoch": 1.3188073394495412,
+      "grad_norm": 2.1249051094055176,
+      "learning_rate": 9.8183267660711e-05,
+      "loss": 0.5679,
+      "mean_token_accuracy": 0.7960763275623322,
+      "num_tokens": 1607995.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.2755417585372926,
+      "epoch": 1.3302752293577982,
+      "grad_norm": 1.7334320545196533,
+      "learning_rate": 9.813227193236144e-05,
+      "loss": 0.5211,
+      "mean_token_accuracy": 0.8171180784702301,
+      "num_tokens": 1621183.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.300136685371399,
+      "epoch": 1.341743119266055,
+      "grad_norm": 1.604264259338379,
+      "learning_rate": 9.808058396605945e-05,
+      "loss": 0.5622,
+      "mean_token_accuracy": 0.7956745982170105,
+      "num_tokens": 1634961.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.2956653475761413,
+      "epoch": 1.353211009174312,
+      "grad_norm": 2.304135322570801,
+      "learning_rate": 9.802820450518095e-05,
+      "loss": 0.5919,
+      "mean_token_accuracy": 0.7799835622310638,
+      "num_tokens": 1648959.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.3270721554756164,
+      "epoch": 1.364678899082569,
+      "grad_norm": 2.304185390472412,
+      "learning_rate": 9.797513430304695e-05,
+      "loss": 0.6347,
+      "mean_token_accuracy": 0.7729239940643311,
+      "num_tokens": 1662218.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.3200181603431702,
+      "epoch": 1.3761467889908257,
+      "grad_norm": 2.673722743988037,
+      "learning_rate": 9.792137412291265e-05,
+      "loss": 0.6568,
+      "mean_token_accuracy": 0.7654553771018981,
+      "num_tokens": 1675320.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.3001809120178223,
+      "epoch": 1.3876146788990826,
+      "grad_norm": 1.8785172700881958,
+      "learning_rate": 9.786692473795654e-05,
+      "loss": 0.5498,
+      "mean_token_accuracy": 0.7971892893314362,
+      "num_tokens": 1688732.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.2927094459533692,
+      "epoch": 1.3990825688073394,
+      "grad_norm": 2.299051284790039,
+      "learning_rate": 9.781178693126923e-05,
+      "loss": 0.5317,
+      "mean_token_accuracy": 0.812885046005249,
+      "num_tokens": 1702489.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.2940443515777589,
+      "epoch": 1.4105504587155964,
+      "grad_norm": 2.107447385787964,
+      "learning_rate": 9.775596149584226e-05,
+      "loss": 0.5408,
+      "mean_token_accuracy": 0.8026755452156067,
+      "num_tokens": 1717066.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.2880491733551025,
+      "epoch": 1.4220183486238533,
+      "grad_norm": 2.120649814605713,
+      "learning_rate": 9.769944923455654e-05,
+      "loss": 0.5122,
+      "mean_token_accuracy": 0.8185527265071869,
+      "num_tokens": 1730503.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.2935888648033143,
+      "epoch": 1.43348623853211,
+      "grad_norm": 1.8897229433059692,
+      "learning_rate": 9.764225096017102e-05,
+      "loss": 0.5891,
+      "mean_token_accuracy": 0.7794159233570099,
+      "num_tokens": 1744257.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.2713160991668702,
+      "epoch": 1.4449541284403669,
+      "grad_norm": 1.9189554452896118,
+      "learning_rate": 9.758436749531079e-05,
+      "loss": 0.5146,
+      "mean_token_accuracy": 0.818141633272171,
+      "num_tokens": 1758267.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.2798304796218871,
+      "epoch": 1.4564220183486238,
+      "grad_norm": 2.2521767616271973,
+      "learning_rate": 9.752579967245538e-05,
+      "loss": 0.5959,
+      "mean_token_accuracy": 0.7902258694171905,
+      "num_tokens": 1771990.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.296580719947815,
+      "epoch": 1.4678899082568808,
+      "grad_norm": 1.5478334426879883,
+      "learning_rate": 9.746654833392677e-05,
+      "loss": 0.5636,
+      "mean_token_accuracy": 0.8009288847446442,
+      "num_tokens": 1786045.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.2467906951904297,
+      "epoch": 1.4793577981651376,
+      "grad_norm": 1.8531265258789062,
+      "learning_rate": 9.740661433187725e-05,
+      "loss": 0.4514,
+      "mean_token_accuracy": 0.8369600057601929,
+      "num_tokens": 1800019.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.2813060760498047,
+      "epoch": 1.4908256880733946,
+      "grad_norm": 2.007786512374878,
+      "learning_rate": 9.734599852827712e-05,
+      "loss": 0.5587,
+      "mean_token_accuracy": 0.8045243263244629,
+      "num_tokens": 1814394.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.2923226833343506,
+      "epoch": 1.5022935779816513,
+      "grad_norm": 2.0562584400177,
+      "learning_rate": 9.728470179490244e-05,
+      "loss": 0.563,
+      "mean_token_accuracy": 0.79967080950737,
+      "num_tokens": 1827604.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.28248028755188,
+      "epoch": 1.5137614678899083,
+      "grad_norm": 1.8021918535232544,
+      "learning_rate": 9.72227250133223e-05,
+      "loss": 0.5535,
+      "mean_token_accuracy": 0.8028985977172851,
+      "num_tokens": 1841751.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.2800176739692688,
+      "epoch": 1.5252293577981653,
+      "grad_norm": 2.0901622772216797,
+      "learning_rate": 9.71600690748863e-05,
+      "loss": 0.5889,
+      "mean_token_accuracy": 0.7968101024627685,
+      "num_tokens": 1856403.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.2775539755821228,
+      "epoch": 1.536697247706422,
+      "grad_norm": 1.9024734497070312,
+      "learning_rate": 9.709673488071163e-05,
+      "loss": 0.5529,
+      "mean_token_accuracy": 0.7998219549655914,
+      "num_tokens": 1870952.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.3066880822181701,
+      "epoch": 1.5481651376146788,
+      "grad_norm": 2.2026913166046143,
+      "learning_rate": 9.70327233416702e-05,
+      "loss": 0.6146,
+      "mean_token_accuracy": 0.7799036145210266,
+      "num_tokens": 1884850.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.2854471683502198,
+      "epoch": 1.5596330275229358,
+      "grad_norm": 1.995058298110962,
+      "learning_rate": 9.696803537837542e-05,
+      "loss": 0.5744,
+      "mean_token_accuracy": 0.7955298364162445,
+      "num_tokens": 1898895.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.2856696963310241,
+      "epoch": 1.5711009174311927,
+      "grad_norm": 1.913603663444519,
+      "learning_rate": 9.690267192116908e-05,
+      "loss": 0.525,
+      "mean_token_accuracy": 0.8169679343700409,
+      "num_tokens": 1913026.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.3183680534362794,
+      "epoch": 1.5825688073394495,
+      "grad_norm": 2.7248916625976562,
+      "learning_rate": 9.683663391010791e-05,
+      "loss": 0.6482,
+      "mean_token_accuracy": 0.7678777754306794,
+      "num_tokens": 1927053.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.298743522167206,
+      "epoch": 1.5940366972477065,
+      "grad_norm": 2.011831521987915,
+      "learning_rate": 9.676992229495004e-05,
+      "loss": 0.577,
+      "mean_token_accuracy": 0.7876397609710694,
+      "num_tokens": 1940596.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.294689130783081,
+      "epoch": 1.6055045871559632,
+      "grad_norm": 2.2598249912261963,
+      "learning_rate": 9.670253803514142e-05,
+      "loss": 0.5746,
+      "mean_token_accuracy": 0.7938637971878052,
+      "num_tokens": 1955635.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.3118200659751893,
+      "epoch": 1.6169724770642202,
+      "grad_norm": 1.9109872579574585,
+      "learning_rate": 9.66344820998019e-05,
+      "loss": 0.5996,
+      "mean_token_accuracy": 0.7869695067405701,
+      "num_tokens": 1970187.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.2969690084457397,
+      "epoch": 1.6284403669724772,
+      "grad_norm": 2.021652936935425,
+      "learning_rate": 9.656575546771144e-05,
+      "loss": 0.5692,
+      "mean_token_accuracy": 0.7921172618865967,
+      "num_tokens": 1983963.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.3053216218948365,
+      "epoch": 1.639908256880734,
+      "grad_norm": 2.056626081466675,
+      "learning_rate": 9.649635912729589e-05,
+      "loss": 0.5534,
+      "mean_token_accuracy": 0.7994763553142548,
+      "num_tokens": 1997426.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.307614517211914,
+      "epoch": 1.6513761467889907,
+      "grad_norm": 2.0294957160949707,
+      "learning_rate": 9.642629407661288e-05,
+      "loss": 0.6113,
+      "mean_token_accuracy": 0.7812033116817474,
+      "num_tokens": 2011810.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.2840725421905517,
+      "epoch": 1.6628440366972477,
+      "grad_norm": 2.376054525375366,
+      "learning_rate": 9.63555613233374e-05,
+      "loss": 0.5333,
+      "mean_token_accuracy": 0.8069488048553467,
+      "num_tokens": 2025702.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.2848711609840393,
+      "epoch": 1.6743119266055047,
+      "grad_norm": 2.387098550796509,
+      "learning_rate": 9.628416188474735e-05,
+      "loss": 0.5295,
+      "mean_token_accuracy": 0.8113990724086761,
+      "num_tokens": 2040039.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.3038938522338868,
+      "epoch": 1.6857798165137616,
+      "grad_norm": 2.6049790382385254,
+      "learning_rate": 9.621209678770889e-05,
+      "loss": 0.5902,
+      "mean_token_accuracy": 0.7839356422424316,
+      "num_tokens": 2054883.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.3001854181289674,
+      "epoch": 1.6972477064220184,
+      "grad_norm": 2.08150577545166,
+      "learning_rate": 9.613936706866168e-05,
+      "loss": 0.5804,
+      "mean_token_accuracy": 0.7912817001342773,
+      "num_tokens": 2068892.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.2911452770233154,
+      "epoch": 1.7087155963302751,
+      "grad_norm": 2.2386717796325684,
+      "learning_rate": 9.606597377360396e-05,
+      "loss": 0.5902,
+      "mean_token_accuracy": 0.7858116149902343,
+      "num_tokens": 2083075.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.2923203349113463,
+      "epoch": 1.7201834862385321,
+      "grad_norm": 1.9360357522964478,
+      "learning_rate": 9.59919179580775e-05,
+      "loss": 0.5931,
+      "mean_token_accuracy": 0.7880455732345581,
+      "num_tokens": 2097088.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.2811247110366821,
+      "epoch": 1.731651376146789,
+      "grad_norm": 2.346832275390625,
+      "learning_rate": 9.591720068715247e-05,
+      "loss": 0.5381,
+      "mean_token_accuracy": 0.8110429465770721,
+      "num_tokens": 2110713.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.2997817516326904,
+      "epoch": 1.7431192660550459,
+      "grad_norm": 2.1013338565826416,
+      "learning_rate": 9.584182303541205e-05,
+      "loss": 0.5771,
+      "mean_token_accuracy": 0.7898500382900238,
+      "num_tokens": 2124467.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.283075988292694,
+      "epoch": 1.7545871559633026,
+      "grad_norm": 1.718410849571228,
+      "learning_rate": 9.576578608693703e-05,
+      "loss": 0.5545,
+      "mean_token_accuracy": 0.8036096036434174,
+      "num_tokens": 2139017.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.2541950225830079,
+      "epoch": 1.7660550458715596,
+      "grad_norm": 2.381345510482788,
+      "learning_rate": 9.568909093529022e-05,
+      "loss": 0.5071,
+      "mean_token_accuracy": 0.8172869801521301,
+      "num_tokens": 2153212.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.2600136041641234,
+      "epoch": 1.7775229357798166,
+      "grad_norm": 1.9568657875061035,
+      "learning_rate": 9.561173868350067e-05,
+      "loss": 0.5251,
+      "mean_token_accuracy": 0.8089884519577026,
+      "num_tokens": 2167190.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.2688735485076905,
+      "epoch": 1.7889908256880735,
+      "grad_norm": 2.0126872062683105,
+      "learning_rate": 9.553373044404783e-05,
+      "loss": 0.5563,
+      "mean_token_accuracy": 0.8013049483299255,
+      "num_tokens": 2181135.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.2632331728935242,
+      "epoch": 1.8004587155963303,
+      "grad_norm": 1.7177560329437256,
+      "learning_rate": 9.54550673388456e-05,
+      "loss": 0.5456,
+      "mean_token_accuracy": 0.8039442837238312,
+      "num_tokens": 2195099.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.2656291127204895,
+      "epoch": 1.811926605504587,
+      "grad_norm": 2.6126630306243896,
+      "learning_rate": 9.537575049922613e-05,
+      "loss": 0.5516,
+      "mean_token_accuracy": 0.7961392283439637,
+      "num_tokens": 2209220.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.278434193134308,
+      "epoch": 1.823394495412844,
+      "grad_norm": 2.216356039047241,
+      "learning_rate": 9.52957810659236e-05,
+      "loss": 0.548,
+      "mean_token_accuracy": 0.7977044761180878,
+      "num_tokens": 2222873.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.285041868686676,
+      "epoch": 1.834862385321101,
+      "grad_norm": 2.2278988361358643,
+      "learning_rate": 9.521516018905771e-05,
+      "loss": 0.5905,
+      "mean_token_accuracy": 0.7802383601665497,
+      "num_tokens": 2237054.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.2938857316970824,
+      "epoch": 1.8463302752293578,
+      "grad_norm": 2.0378856658935547,
+      "learning_rate": 9.513388902811733e-05,
+      "loss": 0.6033,
+      "mean_token_accuracy": 0.7891092479228974,
+      "num_tokens": 2250581.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.2730875372886659,
+      "epoch": 1.8577981651376145,
+      "grad_norm": 1.9576410055160522,
+      "learning_rate": 9.505196875194362e-05,
+      "loss": 0.5709,
+      "mean_token_accuracy": 0.7948619246482849,
+      "num_tokens": 2264352.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.2942588448524475,
+      "epoch": 1.8692660550458715,
+      "grad_norm": 3.2486989498138428,
+      "learning_rate": 9.496940053871333e-05,
+      "loss": 0.5695,
+      "mean_token_accuracy": 0.7931654870510101,
+      "num_tokens": 2278395.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.2859179735183717,
+      "epoch": 1.8807339449541285,
+      "grad_norm": 1.7161357402801514,
+      "learning_rate": 9.488618557592187e-05,
+      "loss": 0.5588,
+      "mean_token_accuracy": 0.7988445639610291,
+      "num_tokens": 2292458.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.287862777709961,
+      "epoch": 1.8922018348623855,
+      "grad_norm": 1.7279341220855713,
+      "learning_rate": 9.480232506036618e-05,
+      "loss": 0.5718,
+      "mean_token_accuracy": 0.7963582694530487,
+      "num_tokens": 2305950.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.2868569016456604,
+      "epoch": 1.9036697247706422,
+      "grad_norm": 1.7532700300216675,
+      "learning_rate": 9.471782019812748e-05,
+      "loss": 0.5739,
+      "mean_token_accuracy": 0.7951330602169037,
+      "num_tokens": 2320092.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.2931817889213562,
+      "epoch": 1.915137614678899,
+      "grad_norm": 2.7232377529144287,
+      "learning_rate": 9.463267220455408e-05,
+      "loss": 0.5996,
+      "mean_token_accuracy": 0.7812487840652466,
+      "num_tokens": 2334035.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.2847351789474488,
+      "epoch": 1.926605504587156,
+      "grad_norm": 2.1023809909820557,
+      "learning_rate": 9.454688230424372e-05,
+      "loss": 0.5516,
+      "mean_token_accuracy": 0.8027086973190307,
+      "num_tokens": 2348205.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.2944233417510986,
+      "epoch": 1.938073394495413,
+      "grad_norm": 2.789158582687378,
+      "learning_rate": 9.446045173102607e-05,
+      "loss": 0.6096,
+      "mean_token_accuracy": 0.7904924273490905,
+      "num_tokens": 2362411.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.296918225288391,
+      "epoch": 1.9495412844036697,
+      "grad_norm": 2.8648757934570312,
+      "learning_rate": 9.437338172794495e-05,
+      "loss": 0.5851,
+      "mean_token_accuracy": 0.7826291382312774,
+      "num_tokens": 2376229.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.2416040658950807,
+      "epoch": 1.9610091743119265,
+      "grad_norm": 2.146327257156372,
+      "learning_rate": 9.428567354724047e-05,
+      "loss": 0.5003,
+      "mean_token_accuracy": 0.8209156513214111,
+      "num_tokens": 2389870.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.299118459224701,
+      "epoch": 1.9724770642201834,
+      "grad_norm": 1.9699536561965942,
+      "learning_rate": 9.419732845033093e-05,
+      "loss": 0.5857,
+      "mean_token_accuracy": 0.7884073138237,
+      "num_tokens": 2403887.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.307542335987091,
+      "epoch": 1.9839449541284404,
+      "grad_norm": 2.541121006011963,
+      "learning_rate": 9.410834770779489e-05,
+      "loss": 0.6299,
+      "mean_token_accuracy": 0.7736253619194031,
+      "num_tokens": 2418109.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.2949981808662414,
+      "epoch": 1.9954128440366974,
+      "grad_norm": 1.7402102947235107,
+      "learning_rate": 9.401873259935261e-05,
+      "loss": 0.5928,
+      "mean_token_accuracy": 0.7905942320823669,
+      "num_tokens": 2432561.0,
+      "step": 1740
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 8720,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.036845473218007e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dcff46eb1f7b1db33b94473d51718fd5ce505d0f76daf7d95b3eed2319ff9b0
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:719f5fb00afaa0b7a771b34e32c90e065e8a706f3a2f57195703efc1935853dc
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2644 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 2616,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2426587224006653,
+      "epoch": 0.011467889908256881,
+      "grad_norm": 3.028918981552124,
+      "learning_rate": 2.0642201834862385e-06,
+      "loss": 0.6546,
+      "mean_token_accuracy": 0.7775660812854767,
+      "num_tokens": 14294.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2369988679885864,
+      "epoch": 0.022935779816513763,
+      "grad_norm": 2.726299524307251,
+      "learning_rate": 4.357798165137615e-06,
+      "loss": 0.6567,
+      "mean_token_accuracy": 0.7740493595600129,
+      "num_tokens": 28037.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.2392263770103455,
+      "epoch": 0.034403669724770644,
+      "grad_norm": 2.2062387466430664,
+      "learning_rate": 6.651376146788992e-06,
+      "loss": 0.6411,
+      "mean_token_accuracy": 0.7753027558326722,
+      "num_tokens": 42268.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2435187816619873,
+      "epoch": 0.045871559633027525,
+      "grad_norm": 2.4188315868377686,
+      "learning_rate": 8.944954128440369e-06,
+      "loss": 0.6271,
+      "mean_token_accuracy": 0.7827982664108276,
+      "num_tokens": 56474.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.261301326751709,
+      "epoch": 0.05733944954128441,
+      "grad_norm": 2.490365982055664,
+      "learning_rate": 1.1238532110091744e-05,
+      "loss": 0.6343,
+      "mean_token_accuracy": 0.7792274355888367,
+      "num_tokens": 70348.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2830254554748535,
+      "epoch": 0.06880733944954129,
+      "grad_norm": 2.6820173263549805,
+      "learning_rate": 1.3532110091743119e-05,
+      "loss": 0.6298,
+      "mean_token_accuracy": 0.7811978995800019,
+      "num_tokens": 84857.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.275150680541992,
+      "epoch": 0.08027522935779817,
+      "grad_norm": 1.8796888589859009,
+      "learning_rate": 1.5825688073394497e-05,
+      "loss": 0.6689,
+      "mean_token_accuracy": 0.7587406218051911,
+      "num_tokens": 99396.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.2276832342147828,
+      "epoch": 0.09174311926605505,
+      "grad_norm": 1.690596580505371,
+      "learning_rate": 1.811926605504587e-05,
+      "loss": 0.5586,
+      "mean_token_accuracy": 0.8031339049339294,
+      "num_tokens": 112619.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.2304488182067872,
+      "epoch": 0.10321100917431193,
+      "grad_norm": 1.7719478607177734,
+      "learning_rate": 2.0412844036697248e-05,
+      "loss": 0.6093,
+      "mean_token_accuracy": 0.7849110841751099,
+      "num_tokens": 126496.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.2812824487686156,
+      "epoch": 0.11467889908256881,
+      "grad_norm": 1.5947109460830688,
+      "learning_rate": 2.2706422018348624e-05,
+      "loss": 0.6455,
+      "mean_token_accuracy": 0.7670063555240632,
+      "num_tokens": 140628.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.2834580063819885,
+      "epoch": 0.12614678899082568,
+      "grad_norm": 1.8809813261032104,
+      "learning_rate": 2.5e-05,
+      "loss": 0.6489,
+      "mean_token_accuracy": 0.7716078400611878,
+      "num_tokens": 153959.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.2611944794654846,
+      "epoch": 0.13761467889908258,
+      "grad_norm": 1.780765414237976,
+      "learning_rate": 2.7293577981651375e-05,
+      "loss": 0.5445,
+      "mean_token_accuracy": 0.8067175269126892,
+      "num_tokens": 168207.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.272983717918396,
+      "epoch": 0.14908256880733944,
+      "grad_norm": 2.119795560836792,
+      "learning_rate": 2.9587155963302755e-05,
+      "loss": 0.6021,
+      "mean_token_accuracy": 0.7827390134334564,
+      "num_tokens": 182042.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.2775203227996825,
+      "epoch": 0.16055045871559634,
+      "grad_norm": 1.4455509185791016,
+      "learning_rate": 3.188073394495413e-05,
+      "loss": 0.5715,
+      "mean_token_accuracy": 0.7992757976055145,
+      "num_tokens": 196015.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.2987091898918153,
+      "epoch": 0.1720183486238532,
+      "grad_norm": 1.4980850219726562,
+      "learning_rate": 3.4174311926605505e-05,
+      "loss": 0.6023,
+      "mean_token_accuracy": 0.7867661654949188,
+      "num_tokens": 210215.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.2755884647369384,
+      "epoch": 0.1834862385321101,
+      "grad_norm": 1.8361093997955322,
+      "learning_rate": 3.646788990825688e-05,
+      "loss": 0.6042,
+      "mean_token_accuracy": 0.7830365121364593,
+      "num_tokens": 224017.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.2698514223098756,
+      "epoch": 0.19495412844036697,
+      "grad_norm": 1.3812596797943115,
+      "learning_rate": 3.876146788990826e-05,
+      "loss": 0.5846,
+      "mean_token_accuracy": 0.7849388122558594,
+      "num_tokens": 237482.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.244412088394165,
+      "epoch": 0.20642201834862386,
+      "grad_norm": 1.798601746559143,
+      "learning_rate": 4.1055045871559636e-05,
+      "loss": 0.547,
+      "mean_token_accuracy": 0.8001754701137542,
+      "num_tokens": 251169.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.2745447516441346,
+      "epoch": 0.21788990825688073,
+      "grad_norm": 1.9844188690185547,
+      "learning_rate": 4.334862385321101e-05,
+      "loss": 0.5623,
+      "mean_token_accuracy": 0.7987869799137115,
+      "num_tokens": 264764.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.3000339031219483,
+      "epoch": 0.22935779816513763,
+      "grad_norm": 1.4861323833465576,
+      "learning_rate": 4.564220183486239e-05,
+      "loss": 0.6123,
+      "mean_token_accuracy": 0.7852487206459046,
+      "num_tokens": 278504.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.3115605235099792,
+      "epoch": 0.2408256880733945,
+      "grad_norm": 1.8251842260360718,
+      "learning_rate": 4.7935779816513766e-05,
+      "loss": 0.658,
+      "mean_token_accuracy": 0.759744155406952,
+      "num_tokens": 293021.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.3224670886993408,
+      "epoch": 0.25229357798165136,
+      "grad_norm": 1.5895862579345703,
+      "learning_rate": 5.022935779816514e-05,
+      "loss": 0.6213,
+      "mean_token_accuracy": 0.7748652398586273,
+      "num_tokens": 306811.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.2993175029754638,
+      "epoch": 0.26376146788990823,
+      "grad_norm": 1.6444000005722046,
+      "learning_rate": 5.252293577981652e-05,
+      "loss": 0.5723,
+      "mean_token_accuracy": 0.7920855104923248,
+      "num_tokens": 320873.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.3019076824188232,
+      "epoch": 0.27522935779816515,
+      "grad_norm": 1.5319479703903198,
+      "learning_rate": 5.481651376146789e-05,
+      "loss": 0.5945,
+      "mean_token_accuracy": 0.7845574736595153,
+      "num_tokens": 335323.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.289823544025421,
+      "epoch": 0.286697247706422,
+      "grad_norm": 1.2900819778442383,
+      "learning_rate": 5.7110091743119266e-05,
+      "loss": 0.5692,
+      "mean_token_accuracy": 0.7950143396854401,
+      "num_tokens": 349575.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.2885443449020386,
+      "epoch": 0.2981651376146789,
+      "grad_norm": 1.408677577972412,
+      "learning_rate": 5.940366972477065e-05,
+      "loss": 0.6104,
+      "mean_token_accuracy": 0.7816850125789643,
+      "num_tokens": 363961.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.3065945267677308,
+      "epoch": 0.30963302752293576,
+      "grad_norm": 1.3809661865234375,
+      "learning_rate": 6.169724770642203e-05,
+      "loss": 0.6426,
+      "mean_token_accuracy": 0.7661891877651215,
+      "num_tokens": 377555.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.2889371395111084,
+      "epoch": 0.3211009174311927,
+      "grad_norm": 1.4974966049194336,
+      "learning_rate": 6.39908256880734e-05,
+      "loss": 0.5882,
+      "mean_token_accuracy": 0.7819968700408936,
+      "num_tokens": 391423.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.2952162742614746,
+      "epoch": 0.33256880733944955,
+      "grad_norm": 1.3621913194656372,
+      "learning_rate": 6.628440366972477e-05,
+      "loss": 0.57,
+      "mean_token_accuracy": 0.7946897804737091,
+      "num_tokens": 405650.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.2888988494873046,
+      "epoch": 0.3440366972477064,
+      "grad_norm": 1.793961524963379,
+      "learning_rate": 6.857798165137616e-05,
+      "loss": 0.6273,
+      "mean_token_accuracy": 0.7732390701770783,
+      "num_tokens": 419332.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.289334809780121,
+      "epoch": 0.3555045871559633,
+      "grad_norm": 1.5518903732299805,
+      "learning_rate": 7.087155963302753e-05,
+      "loss": 0.6492,
+      "mean_token_accuracy": 0.757664144039154,
+      "num_tokens": 433432.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.2899688124656676,
+      "epoch": 0.3669724770642202,
+      "grad_norm": 1.5826157331466675,
+      "learning_rate": 7.31651376146789e-05,
+      "loss": 0.5805,
+      "mean_token_accuracy": 0.7921296834945679,
+      "num_tokens": 447592.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.2863509058952332,
+      "epoch": 0.37844036697247707,
+      "grad_norm": 1.7210900783538818,
+      "learning_rate": 7.545871559633027e-05,
+      "loss": 0.5926,
+      "mean_token_accuracy": 0.7852405548095703,
+      "num_tokens": 462489.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.2890722513198853,
+      "epoch": 0.38990825688073394,
+      "grad_norm": 1.6051267385482788,
+      "learning_rate": 7.775229357798165e-05,
+      "loss": 0.6173,
+      "mean_token_accuracy": 0.7741429924964904,
+      "num_tokens": 476591.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.313070333003998,
+      "epoch": 0.4013761467889908,
+      "grad_norm": 1.7080140113830566,
+      "learning_rate": 8.004587155963303e-05,
+      "loss": 0.6165,
+      "mean_token_accuracy": 0.7842044055461883,
+      "num_tokens": 491338.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.2972561955451964,
+      "epoch": 0.41284403669724773,
+      "grad_norm": 1.7454527616500854,
+      "learning_rate": 8.23394495412844e-05,
+      "loss": 0.5927,
+      "mean_token_accuracy": 0.7841840922832489,
+      "num_tokens": 505152.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.2944241881370544,
+      "epoch": 0.4243119266055046,
+      "grad_norm": 1.8223613500595093,
+      "learning_rate": 8.463302752293578e-05,
+      "loss": 0.5862,
+      "mean_token_accuracy": 0.7846642255783081,
+      "num_tokens": 519536.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.2918418169021606,
+      "epoch": 0.43577981651376146,
+      "grad_norm": 1.323716640472412,
+      "learning_rate": 8.692660550458716e-05,
+      "loss": 0.5761,
+      "mean_token_accuracy": 0.788896131515503,
+      "num_tokens": 533610.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.3106001019477844,
+      "epoch": 0.44724770642201833,
+      "grad_norm": 2.1389827728271484,
+      "learning_rate": 8.922018348623854e-05,
+      "loss": 0.6442,
+      "mean_token_accuracy": 0.7677759766578675,
+      "num_tokens": 547213.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.2924273014068604,
+      "epoch": 0.45871559633027525,
+      "grad_norm": 1.3077127933502197,
+      "learning_rate": 9.151376146788991e-05,
+      "loss": 0.6044,
+      "mean_token_accuracy": 0.7855095267295837,
+      "num_tokens": 560707.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.3057442545890807,
+      "epoch": 0.4701834862385321,
+      "grad_norm": 1.658679723739624,
+      "learning_rate": 9.380733944954129e-05,
+      "loss": 0.5803,
+      "mean_token_accuracy": 0.7926251292228699,
+      "num_tokens": 574533.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.3044120788574218,
+      "epoch": 0.481651376146789,
+      "grad_norm": 1.7965151071548462,
+      "learning_rate": 9.610091743119267e-05,
+      "loss": 0.5984,
+      "mean_token_accuracy": 0.7874112606048584,
+      "num_tokens": 587931.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.3121570587158202,
+      "epoch": 0.49311926605504586,
+      "grad_norm": 1.1833796501159668,
+      "learning_rate": 9.839449541284404e-05,
+      "loss": 0.6231,
+      "mean_token_accuracy": 0.7761680126190186,
+      "num_tokens": 602080.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.3229384422302246,
+      "epoch": 0.5045871559633027,
+      "grad_norm": 1.98506760597229,
+      "learning_rate": 9.99999676404826e-05,
+      "loss": 0.6223,
+      "mean_token_accuracy": 0.774652361869812,
+      "num_tokens": 615535.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.2842121720314026,
+      "epoch": 0.5160550458715596,
+      "grad_norm": 1.8412768840789795,
+      "learning_rate": 9.999939236133826e-05,
+      "loss": 0.5968,
+      "mean_token_accuracy": 0.7840604305267334,
+      "num_tokens": 628767.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.3064908266067505,
+      "epoch": 0.5275229357798165,
+      "grad_norm": 1.7538436651229858,
+      "learning_rate": 9.999809799133033e-05,
+      "loss": 0.6244,
+      "mean_token_accuracy": 0.7701604008674622,
+      "num_tokens": 642874.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.3011385202407837,
+      "epoch": 0.5389908256880734,
+      "grad_norm": 2.0401413440704346,
+      "learning_rate": 9.99960845490744e-05,
+      "loss": 0.5897,
+      "mean_token_accuracy": 0.7876223146915435,
+      "num_tokens": 656374.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.3175038933753966,
+      "epoch": 0.5504587155963303,
+      "grad_norm": 1.5815656185150146,
+      "learning_rate": 9.999335206352783e-05,
+      "loss": 0.6681,
+      "mean_token_accuracy": 0.7586038947105408,
+      "num_tokens": 670397.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.3054586052894592,
+      "epoch": 0.5619266055045872,
+      "grad_norm": 1.7010897397994995,
+      "learning_rate": 9.998990057398916e-05,
+      "loss": 0.6488,
+      "mean_token_accuracy": 0.7646380603313446,
+      "num_tokens": 684143.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.2969472885131836,
+      "epoch": 0.573394495412844,
+      "grad_norm": 2.1294353008270264,
+      "learning_rate": 9.998573013009771e-05,
+      "loss": 0.6505,
+      "mean_token_accuracy": 0.7664439141750335,
+      "num_tokens": 697427.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.3074483752250672,
+      "epoch": 0.5848623853211009,
+      "grad_norm": 2.1683812141418457,
+      "learning_rate": 9.998084079183276e-05,
+      "loss": 0.5897,
+      "mean_token_accuracy": 0.7885696291923523,
+      "num_tokens": 711947.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.2956400752067565,
+      "epoch": 0.5963302752293578,
+      "grad_norm": 1.4167346954345703,
+      "learning_rate": 9.997523262951274e-05,
+      "loss": 0.6388,
+      "mean_token_accuracy": 0.7672183573246002,
+      "num_tokens": 726268.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.315368902683258,
+      "epoch": 0.6077981651376146,
+      "grad_norm": 2.1706671714782715,
+      "learning_rate": 9.996890572379418e-05,
+      "loss": 0.6844,
+      "mean_token_accuracy": 0.7582804381847381,
+      "num_tokens": 740230.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.2926068663597108,
+      "epoch": 0.6192660550458715,
+      "grad_norm": 1.6460140943527222,
+      "learning_rate": 9.99618601656706e-05,
+      "loss": 0.5693,
+      "mean_token_accuracy": 0.795549190044403,
+      "num_tokens": 754570.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.2848342299461364,
+      "epoch": 0.6307339449541285,
+      "grad_norm": 1.7705565690994263,
+      "learning_rate": 9.995409605647117e-05,
+      "loss": 0.6189,
+      "mean_token_accuracy": 0.7828136622905731,
+      "num_tokens": 768740.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.3091715574264526,
+      "epoch": 0.6422018348623854,
+      "grad_norm": 1.7903367280960083,
+      "learning_rate": 9.994561350785923e-05,
+      "loss": 0.6096,
+      "mean_token_accuracy": 0.7809465050697326,
+      "num_tokens": 782860.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.3097781181335448,
+      "epoch": 0.6536697247706422,
+      "grad_norm": 1.6261135339736938,
+      "learning_rate": 9.993641264183074e-05,
+      "loss": 0.6488,
+      "mean_token_accuracy": 0.7686248242855072,
+      "num_tokens": 796852.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.2892103433609008,
+      "epoch": 0.6651376146788991,
+      "grad_norm": 1.530013084411621,
+      "learning_rate": 9.992649359071247e-05,
+      "loss": 0.6099,
+      "mean_token_accuracy": 0.7832099735736847,
+      "num_tokens": 810833.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.2781771540641784,
+      "epoch": 0.676605504587156,
+      "grad_norm": 1.3513305187225342,
+      "learning_rate": 9.991585649716014e-05,
+      "loss": 0.6059,
+      "mean_token_accuracy": 0.7849724233150482,
+      "num_tokens": 825129.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.289398467540741,
+      "epoch": 0.6880733944954128,
+      "grad_norm": 1.2714006900787354,
+      "learning_rate": 9.990450151415636e-05,
+      "loss": 0.6262,
+      "mean_token_accuracy": 0.7734242856502533,
+      "num_tokens": 839084.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.3282314896583558,
+      "epoch": 0.6995412844036697,
+      "grad_norm": 1.6062265634536743,
+      "learning_rate": 9.989242880500837e-05,
+      "loss": 0.6804,
+      "mean_token_accuracy": 0.7598551273345947,
+      "num_tokens": 853275.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.279460871219635,
+      "epoch": 0.7110091743119266,
+      "grad_norm": 1.211531400680542,
+      "learning_rate": 9.987963854334581e-05,
+      "loss": 0.5422,
+      "mean_token_accuracy": 0.8087258577346802,
+      "num_tokens": 867001.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.3079694390296936,
+      "epoch": 0.7224770642201835,
+      "grad_norm": 1.9886008501052856,
+      "learning_rate": 9.986613091311811e-05,
+      "loss": 0.6505,
+      "mean_token_accuracy": 0.7643534898757934,
+      "num_tokens": 880836.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.3083110094070434,
+      "epoch": 0.7339449541284404,
+      "grad_norm": 1.7378991842269897,
+      "learning_rate": 9.98519061085919e-05,
+      "loss": 0.6507,
+      "mean_token_accuracy": 0.7652741134166717,
+      "num_tokens": 894456.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.3111968874931335,
+      "epoch": 0.7454128440366973,
+      "grad_norm": 1.6157206296920776,
+      "learning_rate": 9.983696433434821e-05,
+      "loss": 0.6009,
+      "mean_token_accuracy": 0.7828308165073394,
+      "num_tokens": 908581.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.3001808285713197,
+      "epoch": 0.7568807339449541,
+      "grad_norm": 1.7530412673950195,
+      "learning_rate": 9.982130580527951e-05,
+      "loss": 0.5973,
+      "mean_token_accuracy": 0.7872715950012207,
+      "num_tokens": 922198.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.3001506924629211,
+      "epoch": 0.768348623853211,
+      "grad_norm": 1.8743090629577637,
+      "learning_rate": 9.980493074658665e-05,
+      "loss": 0.5991,
+      "mean_token_accuracy": 0.7848590850830078,
+      "num_tokens": 934965.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.3329032421112061,
+      "epoch": 0.7798165137614679,
+      "grad_norm": 1.646851658821106,
+      "learning_rate": 9.978783939377558e-05,
+      "loss": 0.646,
+      "mean_token_accuracy": 0.76202232837677,
+      "num_tokens": 949474.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.3042344450950623,
+      "epoch": 0.7912844036697247,
+      "grad_norm": 1.6828117370605469,
+      "learning_rate": 9.9770031992654e-05,
+      "loss": 0.5663,
+      "mean_token_accuracy": 0.7932763636112213,
+      "num_tokens": 963414.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.3154001832008362,
+      "epoch": 0.8027522935779816,
+      "grad_norm": 1.8354583978652954,
+      "learning_rate": 9.975150879932784e-05,
+      "loss": 0.5994,
+      "mean_token_accuracy": 0.7792726159095764,
+      "num_tokens": 977203.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.307938539981842,
+      "epoch": 0.8142201834862385,
+      "grad_norm": 1.6509039402008057,
+      "learning_rate": 9.97322700801975e-05,
+      "loss": 0.5663,
+      "mean_token_accuracy": 0.7955432832241058,
+      "num_tokens": 990943.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.3173952937126159,
+      "epoch": 0.8256880733944955,
+      "grad_norm": 1.8522167205810547,
+      "learning_rate": 9.971231611195407e-05,
+      "loss": 0.614,
+      "mean_token_accuracy": 0.7815097570419312,
+      "num_tokens": 1005001.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.340037202835083,
+      "epoch": 0.8371559633027523,
+      "grad_norm": 1.4919304847717285,
+      "learning_rate": 9.969164718157538e-05,
+      "loss": 0.6348,
+      "mean_token_accuracy": 0.7702794313430786,
+      "num_tokens": 1018544.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.3305164098739624,
+      "epoch": 0.8486238532110092,
+      "grad_norm": 1.5445469617843628,
+      "learning_rate": 9.967026358632184e-05,
+      "loss": 0.6136,
+      "mean_token_accuracy": 0.77325798869133,
+      "num_tokens": 1032665.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.3210863590240478,
+      "epoch": 0.8600917431192661,
+      "grad_norm": 1.9453340768814087,
+      "learning_rate": 9.964816563373212e-05,
+      "loss": 0.6514,
+      "mean_token_accuracy": 0.7692999839782715,
+      "num_tokens": 1047328.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.327096664905548,
+      "epoch": 0.8715596330275229,
+      "grad_norm": 1.8478624820709229,
+      "learning_rate": 9.962535364161879e-05,
+      "loss": 0.6003,
+      "mean_token_accuracy": 0.7799559772014618,
+      "num_tokens": 1061305.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.3272370457649232,
+      "epoch": 0.8830275229357798,
+      "grad_norm": 1.9946807622909546,
+      "learning_rate": 9.960182793806377e-05,
+      "loss": 0.6315,
+      "mean_token_accuracy": 0.7699635088443756,
+      "num_tokens": 1075123.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.3235833764076232,
+      "epoch": 0.8944954128440367,
+      "grad_norm": 1.500209927558899,
+      "learning_rate": 9.957758886141351e-05,
+      "loss": 0.6527,
+      "mean_token_accuracy": 0.7683537185192109,
+      "num_tokens": 1089084.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.312354290485382,
+      "epoch": 0.9059633027522935,
+      "grad_norm": 1.6548733711242676,
+      "learning_rate": 9.955263676027427e-05,
+      "loss": 0.5927,
+      "mean_token_accuracy": 0.7949600100517273,
+      "num_tokens": 1103963.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.3421159029006957,
+      "epoch": 0.9174311926605505,
+      "grad_norm": 1.5262596607208252,
+      "learning_rate": 9.95269719935069e-05,
+      "loss": 0.6553,
+      "mean_token_accuracy": 0.7679201364517212,
+      "num_tokens": 1117901.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.344819176197052,
+      "epoch": 0.9288990825688074,
+      "grad_norm": 1.42953360080719,
+      "learning_rate": 9.950059493022193e-05,
+      "loss": 0.6607,
+      "mean_token_accuracy": 0.762078708410263,
+      "num_tokens": 1132174.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.3429975152015685,
+      "epoch": 0.9403669724770642,
+      "grad_norm": 1.648417592048645,
+      "learning_rate": 9.947350594977402e-05,
+      "loss": 0.6929,
+      "mean_token_accuracy": 0.7437104344367981,
+      "num_tokens": 1146769.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.3269536972045899,
+      "epoch": 0.9518348623853211,
+      "grad_norm": 1.802235722541809,
+      "learning_rate": 9.944570544175673e-05,
+      "loss": 0.6676,
+      "mean_token_accuracy": 0.7601192831993103,
+      "num_tokens": 1161091.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.3191216468811036,
+      "epoch": 0.963302752293578,
+      "grad_norm": 1.9612555503845215,
+      "learning_rate": 9.941719380599672e-05,
+      "loss": 0.625,
+      "mean_token_accuracy": 0.7729354560375213,
+      "num_tokens": 1173905.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.3115869045257569,
+      "epoch": 0.9747706422018348,
+      "grad_norm": 1.2845028638839722,
+      "learning_rate": 9.93879714525481e-05,
+      "loss": 0.5944,
+      "mean_token_accuracy": 0.7839926242828369,
+      "num_tokens": 1188063.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.3091205954551697,
+      "epoch": 0.9862385321100917,
+      "grad_norm": 1.8383289575576782,
+      "learning_rate": 9.935803880168652e-05,
+      "loss": 0.6237,
+      "mean_token_accuracy": 0.7753754138946534,
+      "num_tokens": 1202695.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.2994250178337097,
+      "epoch": 0.9977064220183486,
+      "grad_norm": 1.571912407875061,
+      "learning_rate": 9.932739628390316e-05,
+      "loss": 0.6456,
+      "mean_token_accuracy": 0.7671150684356689,
+      "num_tokens": 1216684.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.3076510548591613,
+      "epoch": 1.0091743119266054,
+      "grad_norm": 1.8406661748886108,
+      "learning_rate": 9.929604433989843e-05,
+      "loss": 0.6445,
+      "mean_token_accuracy": 0.7758039116859436,
+      "num_tokens": 1229248.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.2624098420143128,
+      "epoch": 1.0206422018348624,
+      "grad_norm": 1.9808402061462402,
+      "learning_rate": 9.926398342057577e-05,
+      "loss": 0.492,
+      "mean_token_accuracy": 0.8236800074577332,
+      "num_tokens": 1243088.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.252714467048645,
+      "epoch": 1.0321100917431192,
+      "grad_norm": 2.2568917274475098,
+      "learning_rate": 9.923121398703504e-05,
+      "loss": 0.4861,
+      "mean_token_accuracy": 0.8282331109046936,
+      "num_tokens": 1256681.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.2762907862663269,
+      "epoch": 1.0435779816513762,
+      "grad_norm": 1.7591499090194702,
+      "learning_rate": 9.9197736510566e-05,
+      "loss": 0.5326,
+      "mean_token_accuracy": 0.8061232268810272,
+      "num_tokens": 1270563.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.2779451608657837,
+      "epoch": 1.0550458715596331,
+      "grad_norm": 1.7618857622146606,
+      "learning_rate": 9.916355147264142e-05,
+      "loss": 0.5762,
+      "mean_token_accuracy": 0.7888909459114075,
+      "num_tokens": 1284789.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.3000144004821776,
+      "epoch": 1.06651376146789,
+      "grad_norm": 1.929226040840149,
+      "learning_rate": 9.912865936491026e-05,
+      "loss": 0.556,
+      "mean_token_accuracy": 0.7985962986946106,
+      "num_tokens": 1298314.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.2920597314834594,
+      "epoch": 1.0779816513761469,
+      "grad_norm": 2.1356875896453857,
+      "learning_rate": 9.909306068919055e-05,
+      "loss": 0.5872,
+      "mean_token_accuracy": 0.7914662003517151,
+      "num_tokens": 1312524.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.3042231440544128,
+      "epoch": 1.0894495412844036,
+      "grad_norm": 2.148797035217285,
+      "learning_rate": 9.905675595746215e-05,
+      "loss": 0.5507,
+      "mean_token_accuracy": 0.802655827999115,
+      "num_tokens": 1326952.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.277776312828064,
+      "epoch": 1.1009174311926606,
+      "grad_norm": 1.6280494928359985,
+      "learning_rate": 9.901974569185941e-05,
+      "loss": 0.5579,
+      "mean_token_accuracy": 0.8001268386840821,
+      "num_tokens": 1341302.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.2962275981903075,
+      "epoch": 1.1123853211009174,
+      "grad_norm": 1.8065513372421265,
+      "learning_rate": 9.898203042466368e-05,
+      "loss": 0.5492,
+      "mean_token_accuracy": 0.8058996260166168,
+      "num_tokens": 1355689.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.2893213629722595,
+      "epoch": 1.1238532110091743,
+      "grad_norm": 1.864761233329773,
+      "learning_rate": 9.894361069829565e-05,
+      "loss": 0.5292,
+      "mean_token_accuracy": 0.8077204465866089,
+      "num_tokens": 1369850.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.2918407797813416,
+      "epoch": 1.135321100917431,
+      "grad_norm": 2.276775598526001,
+      "learning_rate": 9.89044870653075e-05,
+      "loss": 0.564,
+      "mean_token_accuracy": 0.7952383041381836,
+      "num_tokens": 1384054.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.281248104572296,
+      "epoch": 1.146788990825688,
+      "grad_norm": 2.1157305240631104,
+      "learning_rate": 9.886466008837503e-05,
+      "loss": 0.5706,
+      "mean_token_accuracy": 0.7949798464775085,
+      "num_tokens": 1398492.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.2710728526115418,
+      "epoch": 1.158256880733945,
+      "grad_norm": 1.8817031383514404,
+      "learning_rate": 9.882413034028948e-05,
+      "loss": 0.516,
+      "mean_token_accuracy": 0.8137441635131836,
+      "num_tokens": 1412100.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.2870657205581666,
+      "epoch": 1.1697247706422018,
+      "grad_norm": 1.7975279092788696,
+      "learning_rate": 9.878289840394938e-05,
+      "loss": 0.5374,
+      "mean_token_accuracy": 0.8032542705535889,
+      "num_tokens": 1425770.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.2666459918022155,
+      "epoch": 1.1811926605504588,
+      "grad_norm": 2.47218656539917,
+      "learning_rate": 9.874096487235212e-05,
+      "loss": 0.5158,
+      "mean_token_accuracy": 0.8173266768455505,
+      "num_tokens": 1439309.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.3137032628059386,
+      "epoch": 1.1926605504587156,
+      "grad_norm": 1.7813074588775635,
+      "learning_rate": 9.869833034858538e-05,
+      "loss": 0.5324,
+      "mean_token_accuracy": 0.8099446773529053,
+      "num_tokens": 1454541.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.2864318251609803,
+      "epoch": 1.2041284403669725,
+      "grad_norm": 1.9276366233825684,
+      "learning_rate": 9.86549954458186e-05,
+      "loss": 0.5554,
+      "mean_token_accuracy": 0.8048118472099304,
+      "num_tokens": 1468346.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.2949382424354554,
+      "epoch": 1.2155963302752293,
+      "grad_norm": 1.9171100854873657,
+      "learning_rate": 9.861096078729396e-05,
+      "loss": 0.5857,
+      "mean_token_accuracy": 0.7923648238182068,
+      "num_tokens": 1482839.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.2825786828994752,
+      "epoch": 1.2270642201834863,
+      "grad_norm": 1.458295226097107,
+      "learning_rate": 9.85662270063176e-05,
+      "loss": 0.5344,
+      "mean_token_accuracy": 0.8081244885921478,
+      "num_tokens": 1496532.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.2934918642044066,
+      "epoch": 1.238532110091743,
+      "grad_norm": 2.2048583030700684,
+      "learning_rate": 9.852079474625035e-05,
+      "loss": 0.5802,
+      "mean_token_accuracy": 0.7943230092525482,
+      "num_tokens": 1510406.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.3103590607643127,
+      "epoch": 1.25,
+      "grad_norm": 2.103316307067871,
+      "learning_rate": 9.847466466049868e-05,
+      "loss": 0.5761,
+      "mean_token_accuracy": 0.7919000566005707,
+      "num_tokens": 1524582.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.2943686366081237,
+      "epoch": 1.261467889908257,
+      "grad_norm": 1.8935585021972656,
+      "learning_rate": 9.84278374125051e-05,
+      "loss": 0.5668,
+      "mean_token_accuracy": 0.795119684934616,
+      "num_tokens": 1538645.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.2833523988723754,
+      "epoch": 1.2729357798165137,
+      "grad_norm": 1.5310587882995605,
+      "learning_rate": 9.838031367573868e-05,
+      "loss": 0.4791,
+      "mean_token_accuracy": 0.8290136575698852,
+      "num_tokens": 1552198.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.2810697436332703,
+      "epoch": 1.2844036697247707,
+      "grad_norm": 1.9493242502212524,
+      "learning_rate": 9.833209413368546e-05,
+      "loss": 0.5479,
+      "mean_token_accuracy": 0.7984305679798126,
+      "num_tokens": 1566248.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.2971422672271729,
+      "epoch": 1.2958715596330275,
+      "grad_norm": 2.143052816390991,
+      "learning_rate": 9.828317947983851e-05,
+      "loss": 0.5556,
+      "mean_token_accuracy": 0.7962001860141754,
+      "num_tokens": 1579657.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.2938915967941285,
+      "epoch": 1.3073394495412844,
+      "grad_norm": 3.074519395828247,
+      "learning_rate": 9.823357041768797e-05,
+      "loss": 0.5808,
+      "mean_token_accuracy": 0.7921633243560791,
+      "num_tokens": 1594362.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.3013799428939818,
+      "epoch": 1.3188073394495412,
+      "grad_norm": 2.1249051094055176,
+      "learning_rate": 9.8183267660711e-05,
+      "loss": 0.5679,
+      "mean_token_accuracy": 0.7960763275623322,
+      "num_tokens": 1607995.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.2755417585372926,
+      "epoch": 1.3302752293577982,
+      "grad_norm": 1.7334320545196533,
+      "learning_rate": 9.813227193236144e-05,
+      "loss": 0.5211,
+      "mean_token_accuracy": 0.8171180784702301,
+      "num_tokens": 1621183.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.300136685371399,
+      "epoch": 1.341743119266055,
+      "grad_norm": 1.604264259338379,
+      "learning_rate": 9.808058396605945e-05,
+      "loss": 0.5622,
+      "mean_token_accuracy": 0.7956745982170105,
+      "num_tokens": 1634961.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.2956653475761413,
+      "epoch": 1.353211009174312,
+      "grad_norm": 2.304135322570801,
+      "learning_rate": 9.802820450518095e-05,
+      "loss": 0.5919,
+      "mean_token_accuracy": 0.7799835622310638,
+      "num_tokens": 1648959.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.3270721554756164,
+      "epoch": 1.364678899082569,
+      "grad_norm": 2.304185390472412,
+      "learning_rate": 9.797513430304695e-05,
+      "loss": 0.6347,
+      "mean_token_accuracy": 0.7729239940643311,
+      "num_tokens": 1662218.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.3200181603431702,
+      "epoch": 1.3761467889908257,
+      "grad_norm": 2.673722743988037,
+      "learning_rate": 9.792137412291265e-05,
+      "loss": 0.6568,
+      "mean_token_accuracy": 0.7654553771018981,
+      "num_tokens": 1675320.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.3001809120178223,
+      "epoch": 1.3876146788990826,
+      "grad_norm": 1.8785172700881958,
+      "learning_rate": 9.786692473795654e-05,
+      "loss": 0.5498,
+      "mean_token_accuracy": 0.7971892893314362,
+      "num_tokens": 1688732.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.2927094459533692,
+      "epoch": 1.3990825688073394,
+      "grad_norm": 2.299051284790039,
+      "learning_rate": 9.781178693126923e-05,
+      "loss": 0.5317,
+      "mean_token_accuracy": 0.812885046005249,
+      "num_tokens": 1702489.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.2940443515777589,
+      "epoch": 1.4105504587155964,
+      "grad_norm": 2.107447385787964,
+      "learning_rate": 9.775596149584226e-05,
+      "loss": 0.5408,
+      "mean_token_accuracy": 0.8026755452156067,
+      "num_tokens": 1717066.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.2880491733551025,
+      "epoch": 1.4220183486238533,
+      "grad_norm": 2.120649814605713,
+      "learning_rate": 9.769944923455654e-05,
+      "loss": 0.5122,
+      "mean_token_accuracy": 0.8185527265071869,
+      "num_tokens": 1730503.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.2935888648033143,
+      "epoch": 1.43348623853211,
+      "grad_norm": 1.8897229433059692,
+      "learning_rate": 9.764225096017102e-05,
+      "loss": 0.5891,
+      "mean_token_accuracy": 0.7794159233570099,
+      "num_tokens": 1744257.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.2713160991668702,
+      "epoch": 1.4449541284403669,
+      "grad_norm": 1.9189554452896118,
+      "learning_rate": 9.758436749531079e-05,
+      "loss": 0.5146,
+      "mean_token_accuracy": 0.818141633272171,
+      "num_tokens": 1758267.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.2798304796218871,
+      "epoch": 1.4564220183486238,
+      "grad_norm": 2.2521767616271973,
+      "learning_rate": 9.752579967245538e-05,
+      "loss": 0.5959,
+      "mean_token_accuracy": 0.7902258694171905,
+      "num_tokens": 1771990.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.296580719947815,
+      "epoch": 1.4678899082568808,
+      "grad_norm": 1.5478334426879883,
+      "learning_rate": 9.746654833392677e-05,
+      "loss": 0.5636,
+      "mean_token_accuracy": 0.8009288847446442,
+      "num_tokens": 1786045.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.2467906951904297,
+      "epoch": 1.4793577981651376,
+      "grad_norm": 1.8531265258789062,
+      "learning_rate": 9.740661433187725e-05,
+      "loss": 0.4514,
+      "mean_token_accuracy": 0.8369600057601929,
+      "num_tokens": 1800019.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.2813060760498047,
+      "epoch": 1.4908256880733946,
+      "grad_norm": 2.007786512374878,
+      "learning_rate": 9.734599852827712e-05,
+      "loss": 0.5587,
+      "mean_token_accuracy": 0.8045243263244629,
+      "num_tokens": 1814394.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.2923226833343506,
+      "epoch": 1.5022935779816513,
+      "grad_norm": 2.0562584400177,
+      "learning_rate": 9.728470179490244e-05,
+      "loss": 0.563,
+      "mean_token_accuracy": 0.79967080950737,
+      "num_tokens": 1827604.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.28248028755188,
+      "epoch": 1.5137614678899083,
+      "grad_norm": 1.8021918535232544,
+      "learning_rate": 9.72227250133223e-05,
+      "loss": 0.5535,
+      "mean_token_accuracy": 0.8028985977172851,
+      "num_tokens": 1841751.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.2800176739692688,
+      "epoch": 1.5252293577981653,
+      "grad_norm": 2.0901622772216797,
+      "learning_rate": 9.71600690748863e-05,
+      "loss": 0.5889,
+      "mean_token_accuracy": 0.7968101024627685,
+      "num_tokens": 1856403.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.2775539755821228,
+      "epoch": 1.536697247706422,
+      "grad_norm": 1.9024734497070312,
+      "learning_rate": 9.709673488071163e-05,
+      "loss": 0.5529,
+      "mean_token_accuracy": 0.7998219549655914,
+      "num_tokens": 1870952.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.3066880822181701,
+      "epoch": 1.5481651376146788,
+      "grad_norm": 2.2026913166046143,
+      "learning_rate": 9.70327233416702e-05,
+      "loss": 0.6146,
+      "mean_token_accuracy": 0.7799036145210266,
+      "num_tokens": 1884850.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.2854471683502198,
+      "epoch": 1.5596330275229358,
+      "grad_norm": 1.995058298110962,
+      "learning_rate": 9.696803537837542e-05,
+      "loss": 0.5744,
+      "mean_token_accuracy": 0.7955298364162445,
+      "num_tokens": 1898895.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.2856696963310241,
+      "epoch": 1.5711009174311927,
+      "grad_norm": 1.913603663444519,
+      "learning_rate": 9.690267192116908e-05,
+      "loss": 0.525,
+      "mean_token_accuracy": 0.8169679343700409,
+      "num_tokens": 1913026.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.3183680534362794,
+      "epoch": 1.5825688073394495,
+      "grad_norm": 2.7248916625976562,
+      "learning_rate": 9.683663391010791e-05,
+      "loss": 0.6482,
+      "mean_token_accuracy": 0.7678777754306794,
+      "num_tokens": 1927053.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.298743522167206,
+      "epoch": 1.5940366972477065,
+      "grad_norm": 2.011831521987915,
+      "learning_rate": 9.676992229495004e-05,
+      "loss": 0.577,
+      "mean_token_accuracy": 0.7876397609710694,
+      "num_tokens": 1940596.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.294689130783081,
+      "epoch": 1.6055045871559632,
+      "grad_norm": 2.2598249912261963,
+      "learning_rate": 9.670253803514142e-05,
+      "loss": 0.5746,
+      "mean_token_accuracy": 0.7938637971878052,
+      "num_tokens": 1955635.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.3118200659751893,
+      "epoch": 1.6169724770642202,
+      "grad_norm": 1.9109872579574585,
+      "learning_rate": 9.66344820998019e-05,
+      "loss": 0.5996,
+      "mean_token_accuracy": 0.7869695067405701,
+      "num_tokens": 1970187.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.2969690084457397,
+      "epoch": 1.6284403669724772,
+      "grad_norm": 2.021652936935425,
+      "learning_rate": 9.656575546771144e-05,
+      "loss": 0.5692,
+      "mean_token_accuracy": 0.7921172618865967,
+      "num_tokens": 1983963.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.3053216218948365,
+      "epoch": 1.639908256880734,
+      "grad_norm": 2.056626081466675,
+      "learning_rate": 9.649635912729589e-05,
+      "loss": 0.5534,
+      "mean_token_accuracy": 0.7994763553142548,
+      "num_tokens": 1997426.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.307614517211914,
+      "epoch": 1.6513761467889907,
+      "grad_norm": 2.0294957160949707,
+      "learning_rate": 9.642629407661288e-05,
+      "loss": 0.6113,
+      "mean_token_accuracy": 0.7812033116817474,
+      "num_tokens": 2011810.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.2840725421905517,
+      "epoch": 1.6628440366972477,
+      "grad_norm": 2.376054525375366,
+      "learning_rate": 9.63555613233374e-05,
+      "loss": 0.5333,
+      "mean_token_accuracy": 0.8069488048553467,
+      "num_tokens": 2025702.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.2848711609840393,
+      "epoch": 1.6743119266055047,
+      "grad_norm": 2.387098550796509,
+      "learning_rate": 9.628416188474735e-05,
+      "loss": 0.5295,
+      "mean_token_accuracy": 0.8113990724086761,
+      "num_tokens": 2040039.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.3038938522338868,
+      "epoch": 1.6857798165137616,
+      "grad_norm": 2.6049790382385254,
+      "learning_rate": 9.621209678770889e-05,
+      "loss": 0.5902,
+      "mean_token_accuracy": 0.7839356422424316,
+      "num_tokens": 2054883.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.3001854181289674,
+      "epoch": 1.6972477064220184,
+      "grad_norm": 2.08150577545166,
+      "learning_rate": 9.613936706866168e-05,
+      "loss": 0.5804,
+      "mean_token_accuracy": 0.7912817001342773,
+      "num_tokens": 2068892.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.2911452770233154,
+      "epoch": 1.7087155963302751,
+      "grad_norm": 2.2386717796325684,
+      "learning_rate": 9.606597377360396e-05,
+      "loss": 0.5902,
+      "mean_token_accuracy": 0.7858116149902343,
+      "num_tokens": 2083075.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.2923203349113463,
+      "epoch": 1.7201834862385321,
+      "grad_norm": 1.9360357522964478,
+      "learning_rate": 9.59919179580775e-05,
+      "loss": 0.5931,
+      "mean_token_accuracy": 0.7880455732345581,
+      "num_tokens": 2097088.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.2811247110366821,
+      "epoch": 1.731651376146789,
+      "grad_norm": 2.346832275390625,
+      "learning_rate": 9.591720068715247e-05,
+      "loss": 0.5381,
+      "mean_token_accuracy": 0.8110429465770721,
+      "num_tokens": 2110713.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.2997817516326904,
+      "epoch": 1.7431192660550459,
+      "grad_norm": 2.1013338565826416,
+      "learning_rate": 9.584182303541205e-05,
+      "loss": 0.5771,
+      "mean_token_accuracy": 0.7898500382900238,
+      "num_tokens": 2124467.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.283075988292694,
+      "epoch": 1.7545871559633026,
+      "grad_norm": 1.718410849571228,
+      "learning_rate": 9.576578608693703e-05,
+      "loss": 0.5545,
+      "mean_token_accuracy": 0.8036096036434174,
+      "num_tokens": 2139017.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.2541950225830079,
+      "epoch": 1.7660550458715596,
+      "grad_norm": 2.381345510482788,
+      "learning_rate": 9.568909093529022e-05,
+      "loss": 0.5071,
+      "mean_token_accuracy": 0.8172869801521301,
+      "num_tokens": 2153212.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.2600136041641234,
+      "epoch": 1.7775229357798166,
+      "grad_norm": 1.9568657875061035,
+      "learning_rate": 9.561173868350067e-05,
+      "loss": 0.5251,
+      "mean_token_accuracy": 0.8089884519577026,
+      "num_tokens": 2167190.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.2688735485076905,
+      "epoch": 1.7889908256880735,
+      "grad_norm": 2.0126872062683105,
+      "learning_rate": 9.553373044404783e-05,
+      "loss": 0.5563,
+      "mean_token_accuracy": 0.8013049483299255,
+      "num_tokens": 2181135.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.2632331728935242,
+      "epoch": 1.8004587155963303,
+      "grad_norm": 1.7177560329437256,
+      "learning_rate": 9.54550673388456e-05,
+      "loss": 0.5456,
+      "mean_token_accuracy": 0.8039442837238312,
+      "num_tokens": 2195099.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.2656291127204895,
+      "epoch": 1.811926605504587,
+      "grad_norm": 2.6126630306243896,
+      "learning_rate": 9.537575049922613e-05,
+      "loss": 0.5516,
+      "mean_token_accuracy": 0.7961392283439637,
+      "num_tokens": 2209220.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.278434193134308,
+      "epoch": 1.823394495412844,
+      "grad_norm": 2.216356039047241,
+      "learning_rate": 9.52957810659236e-05,
+      "loss": 0.548,
+      "mean_token_accuracy": 0.7977044761180878,
+      "num_tokens": 2222873.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.285041868686676,
+      "epoch": 1.834862385321101,
+      "grad_norm": 2.2278988361358643,
+      "learning_rate": 9.521516018905771e-05,
+      "loss": 0.5905,
+      "mean_token_accuracy": 0.7802383601665497,
+      "num_tokens": 2237054.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.2938857316970824,
+      "epoch": 1.8463302752293578,
+      "grad_norm": 2.0378856658935547,
+      "learning_rate": 9.513388902811733e-05,
+      "loss": 0.6033,
+      "mean_token_accuracy": 0.7891092479228974,
+      "num_tokens": 2250581.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.2730875372886659,
+      "epoch": 1.8577981651376145,
+      "grad_norm": 1.9576410055160522,
+      "learning_rate": 9.505196875194362e-05,
+      "loss": 0.5709,
+      "mean_token_accuracy": 0.7948619246482849,
+      "num_tokens": 2264352.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.2942588448524475,
+      "epoch": 1.8692660550458715,
+      "grad_norm": 3.2486989498138428,
+      "learning_rate": 9.496940053871333e-05,
+      "loss": 0.5695,
+      "mean_token_accuracy": 0.7931654870510101,
+      "num_tokens": 2278395.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.2859179735183717,
+      "epoch": 1.8807339449541285,
+      "grad_norm": 1.7161357402801514,
+      "learning_rate": 9.488618557592187e-05,
+      "loss": 0.5588,
+      "mean_token_accuracy": 0.7988445639610291,
+      "num_tokens": 2292458.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.287862777709961,
+      "epoch": 1.8922018348623855,
+      "grad_norm": 1.7279341220855713,
+      "learning_rate": 9.480232506036618e-05,
+      "loss": 0.5718,
+      "mean_token_accuracy": 0.7963582694530487,
+      "num_tokens": 2305950.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.2868569016456604,
+      "epoch": 1.9036697247706422,
+      "grad_norm": 1.7532700300216675,
+      "learning_rate": 9.471782019812748e-05,
+      "loss": 0.5739,
+      "mean_token_accuracy": 0.7951330602169037,
+      "num_tokens": 2320092.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.2931817889213562,
+      "epoch": 1.915137614678899,
+      "grad_norm": 2.7232377529144287,
+      "learning_rate": 9.463267220455408e-05,
+      "loss": 0.5996,
+      "mean_token_accuracy": 0.7812487840652466,
+      "num_tokens": 2334035.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.2847351789474488,
+      "epoch": 1.926605504587156,
+      "grad_norm": 2.1023809909820557,
+      "learning_rate": 9.454688230424372e-05,
+      "loss": 0.5516,
+      "mean_token_accuracy": 0.8027086973190307,
+      "num_tokens": 2348205.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.2944233417510986,
+      "epoch": 1.938073394495413,
+      "grad_norm": 2.789158582687378,
+      "learning_rate": 9.446045173102607e-05,
+      "loss": 0.6096,
+      "mean_token_accuracy": 0.7904924273490905,
+      "num_tokens": 2362411.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.296918225288391,
+      "epoch": 1.9495412844036697,
+      "grad_norm": 2.8648757934570312,
+      "learning_rate": 9.437338172794495e-05,
+      "loss": 0.5851,
+      "mean_token_accuracy": 0.7826291382312774,
+      "num_tokens": 2376229.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.2416040658950807,
+      "epoch": 1.9610091743119265,
+      "grad_norm": 2.146327257156372,
+      "learning_rate": 9.428567354724047e-05,
+      "loss": 0.5003,
+      "mean_token_accuracy": 0.8209156513214111,
+      "num_tokens": 2389870.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.299118459224701,
+      "epoch": 1.9724770642201834,
+      "grad_norm": 1.9699536561965942,
+      "learning_rate": 9.419732845033093e-05,
+      "loss": 0.5857,
+      "mean_token_accuracy": 0.7884073138237,
+      "num_tokens": 2403887.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.307542335987091,
+      "epoch": 1.9839449541284404,
+      "grad_norm": 2.541121006011963,
+      "learning_rate": 9.410834770779489e-05,
+      "loss": 0.6299,
+      "mean_token_accuracy": 0.7736253619194031,
+      "num_tokens": 2418109.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.2949981808662414,
+      "epoch": 1.9954128440366974,
+      "grad_norm": 1.7402102947235107,
+      "learning_rate": 9.401873259935261e-05,
+      "loss": 0.5928,
+      "mean_token_accuracy": 0.7905942320823669,
+      "num_tokens": 2432561.0,
+      "step": 1740
+    },
+    {
+      "entropy": 1.2499936938285827,
+      "epoch": 2.006880733944954,
+      "grad_norm": 1.9419931173324585,
+      "learning_rate": 9.392848441384791e-05,
+      "loss": 0.4459,
+      "mean_token_accuracy": 0.8404906570911408,
+      "num_tokens": 2445642.0,
+      "step": 1750
+    },
+    {
+      "entropy": 1.2266974091529845,
+      "epoch": 2.018348623853211,
+      "grad_norm": 2.0387706756591797,
+      "learning_rate": 9.383760444922948e-05,
+      "loss": 0.4638,
+      "mean_token_accuracy": 0.837564754486084,
+      "num_tokens": 2459784.0,
+      "step": 1760
+    },
+    {
+      "entropy": 1.236153519153595,
+      "epoch": 2.029816513761468,
+      "grad_norm": 3.5168395042419434,
+      "learning_rate": 9.374609401253222e-05,
+      "loss": 0.4331,
+      "mean_token_accuracy": 0.8434469997882843,
+      "num_tokens": 2473618.0,
+      "step": 1770
+    },
+    {
+      "entropy": 1.2012577056884766,
+      "epoch": 2.041284403669725,
+      "grad_norm": 2.028303384780884,
+      "learning_rate": 9.365395441985854e-05,
+      "loss": 0.4092,
+      "mean_token_accuracy": 0.8543282926082612,
+      "num_tokens": 2487897.0,
+      "step": 1780
+    },
+    {
+      "entropy": 1.2243779063224793,
+      "epoch": 2.052752293577982,
+      "grad_norm": 2.172203779220581,
+      "learning_rate": 9.35611869963593e-05,
+      "loss": 0.4359,
+      "mean_token_accuracy": 0.8463135242462159,
+      "num_tokens": 2501905.0,
+      "step": 1790
+    },
+    {
+      "entropy": 1.2188843488693237,
+      "epoch": 2.0642201834862384,
+      "grad_norm": 2.771411657333374,
+      "learning_rate": 9.346779307621485e-05,
+      "loss": 0.4237,
+      "mean_token_accuracy": 0.847892826795578,
+      "num_tokens": 2516177.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.2115617513656616,
+      "epoch": 2.0756880733944953,
+      "grad_norm": 3.289663314819336,
+      "learning_rate": 9.33737740026158e-05,
+      "loss": 0.4571,
+      "mean_token_accuracy": 0.8364902794361114,
+      "num_tokens": 2529738.0,
+      "step": 1810
+    },
+    {
+      "entropy": 1.215058648586273,
+      "epoch": 2.0871559633027523,
+      "grad_norm": 2.3567795753479004,
+      "learning_rate": 9.327913112774375e-05,
+      "loss": 0.3728,
+      "mean_token_accuracy": 0.8703641653060913,
+      "num_tokens": 2544060.0,
+      "step": 1820
+    },
+    {
+      "entropy": 1.1677044749259948,
+      "epoch": 2.0986238532110093,
+      "grad_norm": 3.139902114868164,
+      "learning_rate": 9.318386581275175e-05,
+      "loss": 0.393,
+      "mean_token_accuracy": 0.8582305371761322,
+      "num_tokens": 2558412.0,
+      "step": 1830
+    },
+    {
+      "entropy": 1.205257821083069,
+      "epoch": 2.1100917431192663,
+      "grad_norm": 2.8340916633605957,
+      "learning_rate": 9.308797942774481e-05,
+      "loss": 0.4543,
+      "mean_token_accuracy": 0.8368471086025238,
+      "num_tokens": 2571273.0,
+      "step": 1840
+    },
+    {
+      "entropy": 1.2078231811523437,
+      "epoch": 2.121559633027523,
+      "grad_norm": 2.454955577850342,
+      "learning_rate": 9.299147335176018e-05,
+      "loss": 0.4406,
+      "mean_token_accuracy": 0.8422400116920471,
+      "num_tokens": 2585553.0,
+      "step": 1850
+    },
+    {
+      "entropy": 1.192740023136139,
+      "epoch": 2.13302752293578,
+      "grad_norm": 3.1895253658294678,
+      "learning_rate": 9.289434897274742e-05,
+      "loss": 0.3898,
+      "mean_token_accuracy": 0.8690705955028534,
+      "num_tokens": 2599421.0,
+      "step": 1860
+    },
+    {
+      "entropy": 1.2215320587158203,
+      "epoch": 2.1444954128440368,
+      "grad_norm": 2.7647485733032227,
+      "learning_rate": 9.279660768754863e-05,
+      "loss": 0.4149,
+      "mean_token_accuracy": 0.8522852241992951,
+      "num_tokens": 2612894.0,
+      "step": 1870
+    },
+    {
+      "entropy": 1.226225733757019,
+      "epoch": 2.1559633027522938,
+      "grad_norm": 2.4062561988830566,
+      "learning_rate": 9.269825090187818e-05,
+      "loss": 0.4611,
+      "mean_token_accuracy": 0.8352656781673431,
+      "num_tokens": 2626813.0,
+      "step": 1880
+    },
+    {
+      "entropy": 1.2242176175117492,
+      "epoch": 2.1674311926605503,
+      "grad_norm": 3.1008799076080322,
+      "learning_rate": 9.259928003030259e-05,
+      "loss": 0.455,
+      "mean_token_accuracy": 0.8366096138954162,
+      "num_tokens": 2641120.0,
+      "step": 1890
+    },
+    {
+      "entropy": 1.1906989932060241,
+      "epoch": 2.1788990825688073,
+      "grad_norm": 2.4737274646759033,
+      "learning_rate": 9.249969649622012e-05,
+      "loss": 0.4102,
+      "mean_token_accuracy": 0.8577539443969726,
+      "num_tokens": 2655355.0,
+      "step": 1900
+    },
+    {
+      "entropy": 1.2102949023246765,
+      "epoch": 2.1903669724770642,
+      "grad_norm": 2.7570645809173584,
+      "learning_rate": 9.239950173184038e-05,
+      "loss": 0.4653,
+      "mean_token_accuracy": 0.8341022551059722,
+      "num_tokens": 2669546.0,
+      "step": 1910
+    },
+    {
+      "entropy": 1.2431818008422852,
+      "epoch": 2.2018348623853212,
+      "grad_norm": 2.4581117630004883,
+      "learning_rate": 9.229869717816369e-05,
+      "loss": 0.4823,
+      "mean_token_accuracy": 0.8271047711372376,
+      "num_tokens": 2684043.0,
+      "step": 1920
+    },
+    {
+      "entropy": 1.205946135520935,
+      "epoch": 2.213302752293578,
+      "grad_norm": 2.6330184936523438,
+      "learning_rate": 9.219728428496033e-05,
+      "loss": 0.4137,
+      "mean_token_accuracy": 0.8539348840713501,
+      "num_tokens": 2698376.0,
+      "step": 1930
+    },
+    {
+      "entropy": 1.1894460320472717,
+      "epoch": 2.2247706422018347,
+      "grad_norm": 2.838942527770996,
+      "learning_rate": 9.209526451074971e-05,
+      "loss": 0.4026,
+      "mean_token_accuracy": 0.8516385197639466,
+      "num_tokens": 2712547.0,
+      "step": 1940
+    },
+    {
+      "entropy": 1.2164816498756408,
+      "epoch": 2.2362385321100917,
+      "grad_norm": 2.2571582794189453,
+      "learning_rate": 9.199263932277945e-05,
+      "loss": 0.4471,
+      "mean_token_accuracy": 0.8385171294212341,
+      "num_tokens": 2726604.0,
+      "step": 1950
+    },
+    {
+      "entropy": 1.2147113919258117,
+      "epoch": 2.2477064220183487,
+      "grad_norm": 2.4930830001831055,
+      "learning_rate": 9.188941019700413e-05,
+      "loss": 0.4372,
+      "mean_token_accuracy": 0.8448963344097138,
+      "num_tokens": 2740678.0,
+      "step": 1960
+    },
+    {
+      "entropy": 1.210788643360138,
+      "epoch": 2.2591743119266057,
+      "grad_norm": 2.27130126953125,
+      "learning_rate": 9.178557861806427e-05,
+      "loss": 0.4263,
+      "mean_token_accuracy": 0.8476684868335724,
+      "num_tokens": 2755318.0,
+      "step": 1970
+    },
+    {
+      "entropy": 1.2224658489227296,
+      "epoch": 2.270642201834862,
+      "grad_norm": 2.7551164627075195,
+      "learning_rate": 9.168114607926478e-05,
+      "loss": 0.4593,
+      "mean_token_accuracy": 0.8373873710632325,
+      "num_tokens": 2769370.0,
+      "step": 1980
+    },
+    {
+      "entropy": 1.1999244093894958,
+      "epoch": 2.282110091743119,
+      "grad_norm": 2.5176587104797363,
+      "learning_rate": 9.157611408255362e-05,
+      "loss": 0.4034,
+      "mean_token_accuracy": 0.8577793180942536,
+      "num_tokens": 2783032.0,
+      "step": 1990
+    },
+    {
+      "entropy": 1.2038641333580018,
+      "epoch": 2.293577981651376,
+      "grad_norm": 2.9991376399993896,
+      "learning_rate": 9.147048413850013e-05,
+      "loss": 0.4301,
+      "mean_token_accuracy": 0.8453950345516205,
+      "num_tokens": 2796767.0,
+      "step": 2000
+    },
+    {
+      "entropy": 1.1978623747825623,
+      "epoch": 2.305045871559633,
+      "grad_norm": 3.3090035915374756,
+      "learning_rate": 9.136425776627332e-05,
+      "loss": 0.4478,
+      "mean_token_accuracy": 0.8409509658813477,
+      "num_tokens": 2811040.0,
+      "step": 2010
+    },
+    {
+      "entropy": 1.1866759181022644,
+      "epoch": 2.31651376146789,
+      "grad_norm": 2.55146861076355,
+      "learning_rate": 9.125743649362004e-05,
+      "loss": 0.4111,
+      "mean_token_accuracy": 0.8524983108043671,
+      "num_tokens": 2824974.0,
+      "step": 2020
+    },
+    {
+      "entropy": 1.1859935998916626,
+      "epoch": 2.3279816513761467,
+      "grad_norm": 3.0141847133636475,
+      "learning_rate": 9.115002185684298e-05,
+      "loss": 0.3853,
+      "mean_token_accuracy": 0.8653997778892517,
+      "num_tokens": 2839045.0,
+      "step": 2030
+    },
+    {
+      "entropy": 1.200369417667389,
+      "epoch": 2.3394495412844036,
+      "grad_norm": 3.238649845123291,
+      "learning_rate": 9.104201540077857e-05,
+      "loss": 0.4367,
+      "mean_token_accuracy": 0.849113005399704,
+      "num_tokens": 2852630.0,
+      "step": 2040
+    },
+    {
+      "entropy": 1.1820636987686157,
+      "epoch": 2.3509174311926606,
+      "grad_norm": 2.7103121280670166,
+      "learning_rate": 9.093341867877485e-05,
+      "loss": 0.4002,
+      "mean_token_accuracy": 0.8553554117679596,
+      "num_tokens": 2866122.0,
+      "step": 2050
+    },
+    {
+      "entropy": 1.1925541877746582,
+      "epoch": 2.3623853211009176,
+      "grad_norm": 2.6658740043640137,
+      "learning_rate": 9.082423325266898e-05,
+      "loss": 0.3514,
+      "mean_token_accuracy": 0.8793700635433197,
+      "num_tokens": 2879896.0,
+      "step": 2060
+    },
+    {
+      "entropy": 1.2116564989089966,
+      "epoch": 2.373853211009174,
+      "grad_norm": 2.671292304992676,
+      "learning_rate": 9.071446069276487e-05,
+      "loss": 0.4465,
+      "mean_token_accuracy": 0.8398383617401123,
+      "num_tokens": 2893860.0,
+      "step": 2070
+    },
+    {
+      "entropy": 1.188430666923523,
+      "epoch": 2.385321100917431,
+      "grad_norm": 2.8591768741607666,
+      "learning_rate": 9.060410257781067e-05,
+      "loss": 0.4051,
+      "mean_token_accuracy": 0.8556796789169312,
+      "num_tokens": 2907637.0,
+      "step": 2080
+    },
+    {
+      "entropy": 1.2041984677314759,
+      "epoch": 2.396788990825688,
+      "grad_norm": 4.1422858238220215,
+      "learning_rate": 9.049316049497587e-05,
+      "loss": 0.4237,
+      "mean_token_accuracy": 0.8538813769817353,
+      "num_tokens": 2921138.0,
+      "step": 2090
+    },
+    {
+      "entropy": 1.2061491370201112,
+      "epoch": 2.408256880733945,
+      "grad_norm": 3.6644909381866455,
+      "learning_rate": 9.038163603982861e-05,
+      "loss": 0.4718,
+      "mean_token_accuracy": 0.834813779592514,
+      "num_tokens": 2935922.0,
+      "step": 2100
+    },
+    {
+      "entropy": 1.16827290058136,
+      "epoch": 2.419724770642202,
+      "grad_norm": 2.8285586833953857,
+      "learning_rate": 9.026953081631274e-05,
+      "loss": 0.3772,
+      "mean_token_accuracy": 0.862979942560196,
+      "num_tokens": 2950486.0,
+      "step": 2110
+    },
+    {
+      "entropy": 1.1864002346992493,
+      "epoch": 2.4311926605504586,
+      "grad_norm": 2.524240732192993,
+      "learning_rate": 9.015684643672469e-05,
+      "loss": 0.4183,
+      "mean_token_accuracy": 0.8481187999248505,
+      "num_tokens": 2964370.0,
+      "step": 2120
+    },
+    {
+      "entropy": 1.191727840900421,
+      "epoch": 2.4426605504587156,
+      "grad_norm": 2.7000691890716553,
+      "learning_rate": 9.00435845216903e-05,
+      "loss": 0.4099,
+      "mean_token_accuracy": 0.859676867723465,
+      "num_tokens": 2978586.0,
+      "step": 2130
+    },
+    {
+      "entropy": 1.196892774105072,
+      "epoch": 2.4541284403669725,
+      "grad_norm": 2.5411860942840576,
+      "learning_rate": 8.992974670014156e-05,
+      "loss": 0.4231,
+      "mean_token_accuracy": 0.853669410943985,
+      "num_tokens": 2993227.0,
+      "step": 2140
+    },
+    {
+      "entropy": 1.2281125068664551,
+      "epoch": 2.4655963302752295,
+      "grad_norm": 2.984402656555176,
+      "learning_rate": 8.98153346092931e-05,
+      "loss": 0.4914,
+      "mean_token_accuracy": 0.8256430447101593,
+      "num_tokens": 3006905.0,
+      "step": 2150
+    },
+    {
+      "entropy": 1.1872741818428039,
+      "epoch": 2.477064220183486,
+      "grad_norm": 3.6548094749450684,
+      "learning_rate": 8.970034989461869e-05,
+      "loss": 0.4204,
+      "mean_token_accuracy": 0.8485859632492065,
+      "num_tokens": 3020042.0,
+      "step": 2160
+    },
+    {
+      "entropy": 1.210246503353119,
+      "epoch": 2.488532110091743,
+      "grad_norm": 3.1821584701538086,
+      "learning_rate": 8.95847942098276e-05,
+      "loss": 0.4651,
+      "mean_token_accuracy": 0.839034765958786,
+      "num_tokens": 3034121.0,
+      "step": 2170
+    },
+    {
+      "entropy": 1.196474814414978,
+      "epoch": 2.5,
+      "grad_norm": 2.6702466011047363,
+      "learning_rate": 8.946866921684075e-05,
+      "loss": 0.4034,
+      "mean_token_accuracy": 0.853714382648468,
+      "num_tokens": 3048389.0,
+      "step": 2180
+    },
+    {
+      "entropy": 1.1994709730148316,
+      "epoch": 2.511467889908257,
+      "grad_norm": 2.4583847522735596,
+      "learning_rate": 8.935197658576688e-05,
+      "loss": 0.4547,
+      "mean_token_accuracy": 0.835651034116745,
+      "num_tokens": 3062510.0,
+      "step": 2190
+    },
+    {
+      "entropy": 1.1939351201057433,
+      "epoch": 2.522935779816514,
+      "grad_norm": 3.650007724761963,
+      "learning_rate": 8.923471799487848e-05,
+      "loss": 0.4858,
+      "mean_token_accuracy": 0.822267484664917,
+      "num_tokens": 3077006.0,
+      "step": 2200
+    },
+    {
+      "entropy": 1.1909420490264893,
+      "epoch": 2.5344036697247705,
+      "grad_norm": 2.5217745304107666,
+      "learning_rate": 8.911689513058767e-05,
+      "loss": 0.4471,
+      "mean_token_accuracy": 0.8431772708892822,
+      "num_tokens": 3090504.0,
+      "step": 2210
+    },
+    {
+      "entropy": 1.199105679988861,
+      "epoch": 2.5458715596330275,
+      "grad_norm": 3.0316340923309326,
+      "learning_rate": 8.899850968742196e-05,
+      "loss": 0.4777,
+      "mean_token_accuracy": 0.8287393450737,
+      "num_tokens": 3104342.0,
+      "step": 2220
+    },
+    {
+      "entropy": 1.2064455270767211,
+      "epoch": 2.5573394495412844,
+      "grad_norm": 3.974283218383789,
+      "learning_rate": 8.887956336799985e-05,
+      "loss": 0.4891,
+      "mean_token_accuracy": 0.8292845666408539,
+      "num_tokens": 3117829.0,
+      "step": 2230
+    },
+    {
+      "entropy": 1.1817766427993774,
+      "epoch": 2.5688073394495414,
+      "grad_norm": 2.7031972408294678,
+      "learning_rate": 8.876005788300634e-05,
+      "loss": 0.4419,
+      "mean_token_accuracy": 0.8345361471176147,
+      "num_tokens": 3131630.0,
+      "step": 2240
+    },
+    {
+      "entropy": 1.177594244480133,
+      "epoch": 2.580275229357798,
+      "grad_norm": 3.531320571899414,
+      "learning_rate": 8.863999495116839e-05,
+      "loss": 0.3869,
+      "mean_token_accuracy": 0.8643759608268737,
+      "num_tokens": 3145294.0,
+      "step": 2250
+    },
+    {
+      "entropy": 1.1700214266777038,
+      "epoch": 2.591743119266055,
+      "grad_norm": 3.8070144653320312,
+      "learning_rate": 8.851937629923012e-05,
+      "loss": 0.3935,
+      "mean_token_accuracy": 0.8605147182941437,
+      "num_tokens": 3158737.0,
+      "step": 2260
+    },
+    {
+      "entropy": 1.2061798334121705,
+      "epoch": 2.603211009174312,
+      "grad_norm": 2.8425440788269043,
+      "learning_rate": 8.839820366192802e-05,
+      "loss": 0.4542,
+      "mean_token_accuracy": 0.8374004244804383,
+      "num_tokens": 3172967.0,
+      "step": 2270
+    },
+    {
+      "entropy": 1.2195912718772888,
+      "epoch": 2.614678899082569,
+      "grad_norm": 5.081627368927002,
+      "learning_rate": 8.827647878196601e-05,
+      "loss": 0.4892,
+      "mean_token_accuracy": 0.825077348947525,
+      "num_tokens": 3186931.0,
+      "step": 2280
+    },
+    {
+      "entropy": 1.2038422107696534,
+      "epoch": 2.626146788990826,
+      "grad_norm": 2.543203592300415,
+      "learning_rate": 8.815420340999033e-05,
+      "loss": 0.4599,
+      "mean_token_accuracy": 0.8369208991527557,
+      "num_tokens": 3201522.0,
+      "step": 2290
+    },
+    {
+      "entropy": 1.1964881420135498,
+      "epoch": 2.6376146788990824,
+      "grad_norm": 2.4345703125,
+      "learning_rate": 8.803137930456443e-05,
+      "loss": 0.404,
+      "mean_token_accuracy": 0.8523661613464355,
+      "num_tokens": 3216119.0,
+      "step": 2300
+    },
+    {
+      "entropy": 1.1602870345115661,
+      "epoch": 2.6490825688073394,
+      "grad_norm": 3.4392335414886475,
+      "learning_rate": 8.790800823214358e-05,
+      "loss": 0.3756,
+      "mean_token_accuracy": 0.8643016874790191,
+      "num_tokens": 3230219.0,
+      "step": 2310
+    },
+    {
+      "entropy": 1.1903732061386108,
+      "epoch": 2.6605504587155964,
+      "grad_norm": 3.1603729724884033,
+      "learning_rate": 8.77840919670496e-05,
+      "loss": 0.4619,
+      "mean_token_accuracy": 0.8334128022193908,
+      "num_tokens": 3243964.0,
+      "step": 2320
+    },
+    {
+      "entropy": 1.2090648889541626,
+      "epoch": 2.6720183486238533,
+      "grad_norm": 3.2517998218536377,
+      "learning_rate": 8.765963229144523e-05,
+      "loss": 0.425,
+      "mean_token_accuracy": 0.848976331949234,
+      "num_tokens": 3258262.0,
+      "step": 2330
+    },
+    {
+      "entropy": 1.1725857734680176,
+      "epoch": 2.68348623853211,
+      "grad_norm": 2.936473846435547,
+      "learning_rate": 8.753463099530851e-05,
+      "loss": 0.4379,
+      "mean_token_accuracy": 0.8404418647289276,
+      "num_tokens": 3272235.0,
+      "step": 2340
+    },
+    {
+      "entropy": 1.2088525772094727,
+      "epoch": 2.694954128440367,
+      "grad_norm": 2.534141778945923,
+      "learning_rate": 8.74090898764071e-05,
+      "loss": 0.4689,
+      "mean_token_accuracy": 0.8306144773960114,
+      "num_tokens": 3285426.0,
+      "step": 2350
+    },
+    {
+      "entropy": 1.2100205898284913,
+      "epoch": 2.706422018348624,
+      "grad_norm": 3.1279990673065186,
+      "learning_rate": 8.728301074027237e-05,
+      "loss": 0.5061,
+      "mean_token_accuracy": 0.819576495885849,
+      "num_tokens": 3299795.0,
+      "step": 2360
+    },
+    {
+      "entropy": 1.2010661005973815,
+      "epoch": 2.717889908256881,
+      "grad_norm": 2.502993583679199,
+      "learning_rate": 8.715639540017348e-05,
+      "loss": 0.4144,
+      "mean_token_accuracy": 0.8534714758396149,
+      "num_tokens": 3314140.0,
+      "step": 2370
+    },
+    {
+      "entropy": 1.1825575947761535,
+      "epoch": 2.729357798165138,
+      "grad_norm": 3.2115747928619385,
+      "learning_rate": 8.70292456770912e-05,
+      "loss": 0.4104,
+      "mean_token_accuracy": 0.8631702959537506,
+      "num_tokens": 3328477.0,
+      "step": 2380
+    },
+    {
+      "entropy": 1.2106116533279419,
+      "epoch": 2.7408256880733948,
+      "grad_norm": 2.2947702407836914,
+      "learning_rate": 8.690156339969188e-05,
+      "loss": 0.452,
+      "mean_token_accuracy": 0.8402946293354034,
+      "num_tokens": 3342974.0,
+      "step": 2390
+    },
+    {
+      "entropy": 1.2192729353904723,
+      "epoch": 2.7522935779816513,
+      "grad_norm": 4.383372783660889,
+      "learning_rate": 8.677335040430098e-05,
+      "loss": 0.4587,
+      "mean_token_accuracy": 0.8375409781932831,
+      "num_tokens": 3356914.0,
+      "step": 2400
+    },
+    {
+      "entropy": 1.2175085544586182,
+      "epoch": 2.7637614678899083,
+      "grad_norm": 2.4807965755462646,
+      "learning_rate": 8.664460853487682e-05,
+      "loss": 0.4861,
+      "mean_token_accuracy": 0.8293013036251068,
+      "num_tokens": 3371546.0,
+      "step": 2410
+    },
+    {
+      "entropy": 1.1996549844741822,
+      "epoch": 2.7752293577981653,
+      "grad_norm": 2.7994184494018555,
+      "learning_rate": 8.651533964298391e-05,
+      "loss": 0.4615,
+      "mean_token_accuracy": 0.8339998602867127,
+      "num_tokens": 3385821.0,
+      "step": 2420
+    },
+    {
+      "entropy": 1.2070747137069702,
+      "epoch": 2.786697247706422,
+      "grad_norm": 2.6360669136047363,
+      "learning_rate": 8.638554558776645e-05,
+      "loss": 0.4724,
+      "mean_token_accuracy": 0.8334950864315033,
+      "num_tokens": 3399429.0,
+      "step": 2430
+    },
+    {
+      "entropy": 1.207094705104828,
+      "epoch": 2.7981651376146788,
+      "grad_norm": 3.541809320449829,
+      "learning_rate": 8.625522823592149e-05,
+      "loss": 0.4595,
+      "mean_token_accuracy": 0.8313855290412903,
+      "num_tokens": 3413081.0,
+      "step": 2440
+    },
+    {
+      "entropy": 1.165670096874237,
+      "epoch": 2.8096330275229358,
+      "grad_norm": 2.136975049972534,
+      "learning_rate": 8.612438946167216e-05,
+      "loss": 0.3889,
+      "mean_token_accuracy": 0.8570282876491546,
+      "num_tokens": 3426548.0,
+      "step": 2450
+    },
+    {
+      "entropy": 1.1852982997894288,
+      "epoch": 2.8211009174311927,
+      "grad_norm": 2.86348819732666,
+      "learning_rate": 8.599303114674069e-05,
+      "loss": 0.4249,
+      "mean_token_accuracy": 0.8503771364688874,
+      "num_tokens": 3440175.0,
+      "step": 2460
+    },
+    {
+      "entropy": 1.2077379465103149,
+      "epoch": 2.8325688073394497,
+      "grad_norm": 2.5795695781707764,
+      "learning_rate": 8.586115518032127e-05,
+      "loss": 0.4562,
+      "mean_token_accuracy": 0.8366669476032257,
+      "num_tokens": 3454525.0,
+      "step": 2470
+    },
+    {
+      "entropy": 1.1829241394996644,
+      "epoch": 2.8440366972477067,
+      "grad_norm": 2.239647388458252,
+      "learning_rate": 8.572876345905305e-05,
+      "loss": 0.3989,
+      "mean_token_accuracy": 0.8530926644802094,
+      "num_tokens": 3468835.0,
+      "step": 2480
+    },
+    {
+      "entropy": 1.1779277324676514,
+      "epoch": 2.8555045871559632,
+      "grad_norm": 2.7707083225250244,
+      "learning_rate": 8.55958578869927e-05,
+      "loss": 0.4161,
+      "mean_token_accuracy": 0.8491211295127868,
+      "num_tokens": 3482540.0,
+      "step": 2490
+    },
+    {
+      "entropy": 1.2271911025047302,
+      "epoch": 2.86697247706422,
+      "grad_norm": 3.0390429496765137,
+      "learning_rate": 8.546244037558709e-05,
+      "loss": 0.5198,
+      "mean_token_accuracy": 0.8124437749385833,
+      "num_tokens": 3496087.0,
+      "step": 2500
+    },
+    {
+      "entropy": 1.2103841185569764,
+      "epoch": 2.878440366972477,
+      "grad_norm": 2.824759006500244,
+      "learning_rate": 8.532851284364583e-05,
+      "loss": 0.4506,
+      "mean_token_accuracy": 0.8423313438892365,
+      "num_tokens": 3510218.0,
+      "step": 2510
+    },
+    {
+      "entropy": 1.2116459369659425,
+      "epoch": 2.8899082568807337,
+      "grad_norm": 2.6834771633148193,
+      "learning_rate": 8.519407721731358e-05,
+      "loss": 0.4716,
+      "mean_token_accuracy": 0.8302793622016906,
+      "num_tokens": 3524137.0,
+      "step": 2520
+    },
+    {
+      "entropy": 1.1814413189888,
+      "epoch": 2.9013761467889907,
+      "grad_norm": 2.3810577392578125,
+      "learning_rate": 8.505913543004249e-05,
+      "loss": 0.4289,
+      "mean_token_accuracy": 0.8481670498847962,
+      "num_tokens": 3538479.0,
+      "step": 2530
+    },
+    {
+      "entropy": 1.2182748675346375,
+      "epoch": 2.9128440366972477,
+      "grad_norm": 2.6762099266052246,
+      "learning_rate": 8.492368942256426e-05,
+      "loss": 0.4608,
+      "mean_token_accuracy": 0.8374363958835602,
+      "num_tokens": 3551130.0,
+      "step": 2540
+    },
+    {
+      "entropy": 1.1905839920043946,
+      "epoch": 2.9243119266055047,
+      "grad_norm": 2.8791279792785645,
+      "learning_rate": 8.478774114286228e-05,
+      "loss": 0.407,
+      "mean_token_accuracy": 0.8582445919513703,
+      "num_tokens": 3565579.0,
+      "step": 2550
+    },
+    {
+      "entropy": 1.2176141619682312,
+      "epoch": 2.9357798165137616,
+      "grad_norm": 3.3622496128082275,
+      "learning_rate": 8.465129254614364e-05,
+      "loss": 0.4531,
+      "mean_token_accuracy": 0.8369433999061584,
+      "num_tokens": 3579108.0,
+      "step": 2560
+    },
+    {
+      "entropy": 1.2156746864318848,
+      "epoch": 2.9472477064220186,
+      "grad_norm": 2.5315442085266113,
+      "learning_rate": 8.451434559481099e-05,
+      "loss": 0.414,
+      "mean_token_accuracy": 0.8529948651790619,
+      "num_tokens": 3593159.0,
+      "step": 2570
+    },
+    {
+      "entropy": 1.1980236649513245,
+      "epoch": 2.958715596330275,
+      "grad_norm": 3.484841823577881,
+      "learning_rate": 8.437690225843426e-05,
+      "loss": 0.4092,
+      "mean_token_accuracy": 0.8564953148365021,
+      "num_tokens": 3607158.0,
+      "step": 2580
+    },
+    {
+      "entropy": 1.1944978356361389,
+      "epoch": 2.970183486238532,
+      "grad_norm": 2.7124505043029785,
+      "learning_rate": 8.423896451372245e-05,
+      "loss": 0.4692,
+      "mean_token_accuracy": 0.8257829248905182,
+      "num_tokens": 3620742.0,
+      "step": 2590
+    },
+    {
+      "entropy": 1.1990766882896424,
+      "epoch": 2.981651376146789,
+      "grad_norm": 2.3464643955230713,
+      "learning_rate": 8.41005343444951e-05,
+      "loss": 0.4531,
+      "mean_token_accuracy": 0.833730137348175,
+      "num_tokens": 3634908.0,
+      "step": 2600
+    },
+    {
+      "entropy": 1.1942741513252257,
+      "epoch": 2.9931192660550456,
+      "grad_norm": 2.8104026317596436,
+      "learning_rate": 8.396161374165379e-05,
+      "loss": 0.4836,
+      "mean_token_accuracy": 0.8267854213714599,
+      "num_tokens": 3648446.0,
+      "step": 2610
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 8720,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5552682098270106e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dcff46eb1f7b1db33b94473d51718fd5ce505d0f76daf7d95b3eed2319ff9b0
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40c5baddcc79c12d9ba5fea4c312ba84dbb44fb7ed9042e3e2a6d74cb4852642
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{#- Chat template in the <|im_start|>role ... <|im_end|> turn format; optional `tools` are advertised inside the system turn. -#}
+{%- if tools %}
+    {#- Tools given: the system turn holds the system text (messages[0] when it is a system message, otherwise the default prompt) followed by the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool definition per line. -#}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {#- No tools: a system turn is still always emitted, falling back to the default prompt when messages[0] is not a system message. -#}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {#- Plain turns: user messages, system messages other than the first message, and assistant messages without tool calls. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Assistant turn with tool calls: optional text first, then one <tool_call> block per call. -#}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {#- Accept both a call wrapped in a `function` field and a flat name/arguments call. -#}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {#- Arguments that already arrive as a JSON string are emitted verbatim; running them through tojson would quote and escape them a second time. -#}
+            {%- if tool_call.arguments is string %}
+                {{- tool_call.arguments }}
+            {%- else %}
+                {{- tool_call.arguments | tojson }}
+            {%- endif %}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn: it is opened on the first message of a run and closed after the last. -#}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Optionally open an assistant turn for the model to complete. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dcff46eb1f7b1db33b94473d51718fd5ce505d0f76daf7d95b3eed2319ff9b0
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT; rank 8, alpha 32) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen2.5-7B-Instruct
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2180c7767659e7428e28d9a9ccf952ff9277226f7fe3322e822b44b304999421
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{#- Chat template in the <|im_start|>role ... <|im_end|> turn format; optional `tools` are advertised inside the system turn. -#}
+{%- if tools %}
+    {#- Tools given: the system turn holds the system text (messages[0] when it is a system message, otherwise the default prompt) followed by the tool signatures. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool definition per line. -#}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {#- No tools: a system turn is still always emitted, falling back to the default prompt when messages[0] is not a system message. -#}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {#- Plain turns: user messages, system messages other than the first message, and assistant messages without tool calls. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Assistant turn with tool calls: optional text first, then one <tool_call> block per call. -#}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {#- Accept both a call wrapped in a `function` field and a flat name/arguments call. -#}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {#- Arguments that already arrive as a JSON string are emitted verbatim; running them through tojson would quote and escape them a second time. -#}
+            {%- if tool_call.arguments is string %}
+                {{- tool_call.arguments }}
+            {%- else %}
+                {{- tool_call.arguments | tojson }}
+            {%- endif %}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn: it is opened on the first message of a run and closed after the last. -#}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Optionally open an assistant turn for the model to complete. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896