agu18dec committed about 1 month ago

Commit

e968c5f

verified ·

1 Parent(s): c24b69b

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/README.md +61 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/trainer_state.json +1874 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/trainer_state.json +2794 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/trainer_state.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/tokenizer.json +3 -0

.gitattributes CHANGED Viewed

@@ -320,3 +320,14 @@ checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_baseline/tokenizer.json f
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2241/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2490/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2241/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2490/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-5520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-6440/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-7360/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-8280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-920/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/4bc95c7e)
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.19.1
+- TRL: 0.28.0
+- Transformers: 4.57.6
+- Pytorch: 2.9.1
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0523104214bf52665e06f3d4f2914a483131c9fc5d65947216d43af9c5a3c78
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e7089da8d9f7f02bd579baee830e61c329d1ed80ef19e391eec66c111cc675e
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1874 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 1840,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2980000615119933,
+      "epoch": 0.010869565217391304,
+      "grad_norm": 7.514286994934082,
+      "learning_rate": 1.956521739130435e-06,
+      "loss": 1.8548,
+      "mean_token_accuracy": 0.5365569293498993,
+      "num_tokens": 13273.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.3184159398078918,
+      "epoch": 0.021739130434782608,
+      "grad_norm": 6.582128524780273,
+      "learning_rate": 4.130434782608695e-06,
+      "loss": 1.9416,
+      "mean_token_accuracy": 0.5010036021471024,
+      "num_tokens": 26299.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.302778995037079,
+      "epoch": 0.03260869565217391,
+      "grad_norm": 6.661994457244873,
+      "learning_rate": 6.304347826086957e-06,
+      "loss": 1.7644,
+      "mean_token_accuracy": 0.5327741354703903,
+      "num_tokens": 39608.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.355496096611023,
+      "epoch": 0.043478260869565216,
+      "grad_norm": 2.829239845275879,
+      "learning_rate": 8.478260869565217e-06,
+      "loss": 1.5473,
+      "mean_token_accuracy": 0.5216561764478683,
+      "num_tokens": 52279.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.4233315467834473,
+      "epoch": 0.05434782608695652,
+      "grad_norm": 1.384964108467102,
+      "learning_rate": 1.0652173913043479e-05,
+      "loss": 1.3512,
+      "mean_token_accuracy": 0.5346131652593613,
+      "num_tokens": 65371.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.4783715605735779,
+      "epoch": 0.06521739130434782,
+      "grad_norm": 1.2184863090515137,
+      "learning_rate": 1.2826086956521741e-05,
+      "loss": 1.3353,
+      "mean_token_accuracy": 0.5265826016664505,
+      "num_tokens": 78549.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.4811665654182433,
+      "epoch": 0.07608695652173914,
+      "grad_norm": 0.8817082047462463,
+      "learning_rate": 1.5e-05,
+      "loss": 1.2885,
+      "mean_token_accuracy": 0.5369167566299439,
+      "num_tokens": 91168.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.485330879688263,
+      "epoch": 0.08695652173913043,
+      "grad_norm": 1.0375007390975952,
+      "learning_rate": 1.7173913043478263e-05,
+      "loss": 1.3207,
+      "mean_token_accuracy": 0.5182694345712662,
+      "num_tokens": 104210.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.4509355902671814,
+      "epoch": 0.09782608695652174,
+      "grad_norm": 0.866616427898407,
+      "learning_rate": 1.9347826086956523e-05,
+      "loss": 1.2442,
+      "mean_token_accuracy": 0.5508454263210296,
+      "num_tokens": 117342.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.4595998883247376,
+      "epoch": 0.10869565217391304,
+      "grad_norm": 0.9921526312828064,
+      "learning_rate": 2.1521739130434784e-05,
+      "loss": 1.2513,
+      "mean_token_accuracy": 0.5439675092697144,
+      "num_tokens": 130168.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.468259596824646,
+      "epoch": 0.11956521739130435,
+      "grad_norm": 0.8542688488960266,
+      "learning_rate": 2.3695652173913045e-05,
+      "loss": 1.2523,
+      "mean_token_accuracy": 0.5456153243780136,
+      "num_tokens": 143277.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.4652462244033813,
+      "epoch": 0.13043478260869565,
+      "grad_norm": 0.8958607316017151,
+      "learning_rate": 2.5869565217391305e-05,
+      "loss": 1.2564,
+      "mean_token_accuracy": 0.5374186933040619,
+      "num_tokens": 155929.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.4442671895027162,
+      "epoch": 0.14130434782608695,
+      "grad_norm": 1.0437828302383423,
+      "learning_rate": 2.8043478260869566e-05,
+      "loss": 1.2463,
+      "mean_token_accuracy": 0.5506911396980285,
+      "num_tokens": 168922.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.442794382572174,
+      "epoch": 0.15217391304347827,
+      "grad_norm": 1.1950273513793945,
+      "learning_rate": 3.0217391304347827e-05,
+      "loss": 1.2343,
+      "mean_token_accuracy": 0.561489287018776,
+      "num_tokens": 181883.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.4483441829681396,
+      "epoch": 0.16304347826086957,
+      "grad_norm": 1.27411687374115,
+      "learning_rate": 3.239130434782609e-05,
+      "loss": 1.2515,
+      "mean_token_accuracy": 0.5461658954620361,
+      "num_tokens": 194847.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.4531602740287781,
+      "epoch": 0.17391304347826086,
+      "grad_norm": 0.9844512343406677,
+      "learning_rate": 3.456521739130435e-05,
+      "loss": 1.2379,
+      "mean_token_accuracy": 0.5450588703155518,
+      "num_tokens": 207431.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.4460819005966186,
+      "epoch": 0.18478260869565216,
+      "grad_norm": 0.965182363986969,
+      "learning_rate": 3.673913043478261e-05,
+      "loss": 1.2497,
+      "mean_token_accuracy": 0.5442000389099121,
+      "num_tokens": 220382.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.4696384787559509,
+      "epoch": 0.1956521739130435,
+      "grad_norm": 0.8425037860870361,
+      "learning_rate": 3.8913043478260866e-05,
+      "loss": 1.2847,
+      "mean_token_accuracy": 0.5304420441389084,
+      "num_tokens": 232940.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.4491984724998475,
+      "epoch": 0.20652173913043478,
+      "grad_norm": 1.1692280769348145,
+      "learning_rate": 4.1086956521739134e-05,
+      "loss": 1.2342,
+      "mean_token_accuracy": 0.5570813834667205,
+      "num_tokens": 245747.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.466271436214447,
+      "epoch": 0.21739130434782608,
+      "grad_norm": 1.0157368183135986,
+      "learning_rate": 4.3260869565217394e-05,
+      "loss": 1.2499,
+      "mean_token_accuracy": 0.5432725459337234,
+      "num_tokens": 258696.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.4768565893173218,
+      "epoch": 0.22826086956521738,
+      "grad_norm": 1.109692096710205,
+      "learning_rate": 4.5434782608695655e-05,
+      "loss": 1.2343,
+      "mean_token_accuracy": 0.5567020237445831,
+      "num_tokens": 271378.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.4473167181015014,
+      "epoch": 0.2391304347826087,
+      "grad_norm": 0.850563108921051,
+      "learning_rate": 4.7608695652173916e-05,
+      "loss": 1.1959,
+      "mean_token_accuracy": 0.5724921762943268,
+      "num_tokens": 284704.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.4478083968162536,
+      "epoch": 0.25,
+      "grad_norm": 1.0289748907089233,
+      "learning_rate": 4.9782608695652176e-05,
+      "loss": 1.2392,
+      "mean_token_accuracy": 0.5519216269254684,
+      "num_tokens": 296961.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.4915278434753418,
+      "epoch": 0.2608695652173913,
+      "grad_norm": 1.3161778450012207,
+      "learning_rate": 5.195652173913044e-05,
+      "loss": 1.2539,
+      "mean_token_accuracy": 0.5443875581026077,
+      "num_tokens": 310082.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.4435262322425841,
+      "epoch": 0.2717391304347826,
+      "grad_norm": 1.2697113752365112,
+      "learning_rate": 5.41304347826087e-05,
+      "loss": 1.1911,
+      "mean_token_accuracy": 0.576522421836853,
+      "num_tokens": 323044.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.4607349276542663,
+      "epoch": 0.2826086956521739,
+      "grad_norm": 0.8006339073181152,
+      "learning_rate": 5.630434782608696e-05,
+      "loss": 1.2088,
+      "mean_token_accuracy": 0.5584357857704163,
+      "num_tokens": 336108.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.4630830883979797,
+      "epoch": 0.29347826086956524,
+      "grad_norm": 0.8462095856666565,
+      "learning_rate": 5.847826086956521e-05,
+      "loss": 1.2458,
+      "mean_token_accuracy": 0.5520103573799133,
+      "num_tokens": 349210.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.458599328994751,
+      "epoch": 0.30434782608695654,
+      "grad_norm": 0.930942177772522,
+      "learning_rate": 6.0652173913043487e-05,
+      "loss": 1.2219,
+      "mean_token_accuracy": 0.5603324949741364,
+      "num_tokens": 361465.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.4730467200279236,
+      "epoch": 0.31521739130434784,
+      "grad_norm": 0.9836443066596985,
+      "learning_rate": 6.282608695652175e-05,
+      "loss": 1.2493,
+      "mean_token_accuracy": 0.5466845005750656,
+      "num_tokens": 374931.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.4596371173858642,
+      "epoch": 0.32608695652173914,
+      "grad_norm": 0.9860939383506775,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 1.2174,
+      "mean_token_accuracy": 0.556469538807869,
+      "num_tokens": 387929.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.4784631490707398,
+      "epoch": 0.33695652173913043,
+      "grad_norm": 0.8261193037033081,
+      "learning_rate": 6.717391304347827e-05,
+      "loss": 1.2191,
+      "mean_token_accuracy": 0.5600455164909363,
+      "num_tokens": 401392.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.4627429723739624,
+      "epoch": 0.34782608695652173,
+      "grad_norm": 0.896903395652771,
+      "learning_rate": 6.934782608695653e-05,
+      "loss": 1.1987,
+      "mean_token_accuracy": 0.5688268154859543,
+      "num_tokens": 414466.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.481223452091217,
+      "epoch": 0.358695652173913,
+      "grad_norm": 0.9765130877494812,
+      "learning_rate": 7.152173913043479e-05,
+      "loss": 1.2161,
+      "mean_token_accuracy": 0.5661008894443512,
+      "num_tokens": 427231.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.5086342811584472,
+      "epoch": 0.3695652173913043,
+      "grad_norm": 0.8136937022209167,
+      "learning_rate": 7.369565217391304e-05,
+      "loss": 1.2884,
+      "mean_token_accuracy": 0.5307422339916229,
+      "num_tokens": 439856.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.4857903122901917,
+      "epoch": 0.3804347826086957,
+      "grad_norm": 0.913378894329071,
+      "learning_rate": 7.58695652173913e-05,
+      "loss": 1.2631,
+      "mean_token_accuracy": 0.5459983497858047,
+      "num_tokens": 452683.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.5006132960319518,
+      "epoch": 0.391304347826087,
+      "grad_norm": 1.0260237455368042,
+      "learning_rate": 7.804347826086957e-05,
+      "loss": 1.2587,
+      "mean_token_accuracy": 0.5429587304592133,
+      "num_tokens": 465274.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.4861261010169984,
+      "epoch": 0.40217391304347827,
+      "grad_norm": 1.04011869430542,
+      "learning_rate": 8.021739130434783e-05,
+      "loss": 1.2147,
+      "mean_token_accuracy": 0.5620492398738861,
+      "num_tokens": 478175.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.4943390011787414,
+      "epoch": 0.41304347826086957,
+      "grad_norm": 0.9155416488647461,
+      "learning_rate": 8.23913043478261e-05,
+      "loss": 1.2128,
+      "mean_token_accuracy": 0.5667012810707093,
+      "num_tokens": 491001.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.5116252064704896,
+      "epoch": 0.42391304347826086,
+      "grad_norm": 0.8238904476165771,
+      "learning_rate": 8.456521739130435e-05,
+      "loss": 1.2677,
+      "mean_token_accuracy": 0.5370148032903671,
+      "num_tokens": 503764.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.4961246132850647,
+      "epoch": 0.43478260869565216,
+      "grad_norm": 0.8830587863922119,
+      "learning_rate": 8.673913043478261e-05,
+      "loss": 1.1999,
+      "mean_token_accuracy": 0.5743164956569672,
+      "num_tokens": 516294.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.5065942287445069,
+      "epoch": 0.44565217391304346,
+      "grad_norm": 0.9117815494537354,
+      "learning_rate": 8.891304347826088e-05,
+      "loss": 1.2607,
+      "mean_token_accuracy": 0.550678727030754,
+      "num_tokens": 529384.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.5079344272613526,
+      "epoch": 0.45652173913043476,
+      "grad_norm": 0.8730387091636658,
+      "learning_rate": 9.108695652173914e-05,
+      "loss": 1.2087,
+      "mean_token_accuracy": 0.5660586059093475,
+      "num_tokens": 542010.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.5147196769714355,
+      "epoch": 0.4673913043478261,
+      "grad_norm": 0.7791972160339355,
+      "learning_rate": 9.32608695652174e-05,
+      "loss": 1.2471,
+      "mean_token_accuracy": 0.5513437986373901,
+      "num_tokens": 554428.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.5182705640792846,
+      "epoch": 0.4782608695652174,
+      "grad_norm": 0.7569729089736938,
+      "learning_rate": 9.543478260869566e-05,
+      "loss": 1.2876,
+      "mean_token_accuracy": 0.5325394898653031,
+      "num_tokens": 567462.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.5036618828773498,
+      "epoch": 0.4891304347826087,
+      "grad_norm": 0.7794932126998901,
+      "learning_rate": 9.760869565217392e-05,
+      "loss": 1.2539,
+      "mean_token_accuracy": 0.5439064025878906,
+      "num_tokens": 580377.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.4947790503501892,
+      "epoch": 0.5,
+      "grad_norm": 0.8008731007575989,
+      "learning_rate": 9.978260869565218e-05,
+      "loss": 1.2352,
+      "mean_token_accuracy": 0.5524563610553741,
+      "num_tokens": 593597.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.5091761827468873,
+      "epoch": 0.5108695652173914,
+      "grad_norm": 0.9790273904800415,
+      "learning_rate": 9.999973836157333e-05,
+      "loss": 1.2448,
+      "mean_token_accuracy": 0.5587224543094635,
+      "num_tokens": 606659.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.5102417111396789,
+      "epoch": 0.5217391304347826,
+      "grad_norm": 0.9725663065910339,
+      "learning_rate": 9.999883393595947e-05,
+      "loss": 1.2366,
+      "mean_token_accuracy": 0.555170550942421,
+      "num_tokens": 619406.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.5230854153633118,
+      "epoch": 0.532608695652174,
+      "grad_norm": 1.0150320529937744,
+      "learning_rate": 9.999728350473721e-05,
+      "loss": 1.2304,
+      "mean_token_accuracy": 0.5601270943880081,
+      "num_tokens": 632194.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.5028501629829407,
+      "epoch": 0.5434782608695652,
+      "grad_norm": 0.8656931519508362,
+      "learning_rate": 9.99950870879387e-05,
+      "loss": 1.2286,
+      "mean_token_accuracy": 0.5571267485618592,
+      "num_tokens": 645327.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.5302343845367432,
+      "epoch": 0.5543478260869565,
+      "grad_norm": 0.7537740468978882,
+      "learning_rate": 9.99922447139426e-05,
+      "loss": 1.2342,
+      "mean_token_accuracy": 0.5612987399101257,
+      "num_tokens": 658378.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.5025633692741394,
+      "epoch": 0.5652173913043478,
+      "grad_norm": 0.6478719115257263,
+      "learning_rate": 9.998875641947354e-05,
+      "loss": 1.2429,
+      "mean_token_accuracy": 0.5501718163490296,
+      "num_tokens": 671323.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.5004254579544067,
+      "epoch": 0.5760869565217391,
+      "grad_norm": 1.2102432250976562,
+      "learning_rate": 9.998462224960175e-05,
+      "loss": 1.213,
+      "mean_token_accuracy": 0.5621294498443603,
+      "num_tokens": 683878.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.5195318818092347,
+      "epoch": 0.5869565217391305,
+      "grad_norm": 0.7961319088935852,
+      "learning_rate": 9.997984225774238e-05,
+      "loss": 1.2492,
+      "mean_token_accuracy": 0.5559745967388153,
+      "num_tokens": 696935.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.5280181407928466,
+      "epoch": 0.5978260869565217,
+      "grad_norm": 0.8740176558494568,
+      "learning_rate": 9.99744165056549e-05,
+      "loss": 1.2197,
+      "mean_token_accuracy": 0.5634395360946656,
+      "num_tokens": 710020.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.5327624678611755,
+      "epoch": 0.6086956521739131,
+      "grad_norm": 0.8462045192718506,
+      "learning_rate": 9.99683450634423e-05,
+      "loss": 1.2192,
+      "mean_token_accuracy": 0.5612434148788452,
+      "num_tokens": 723303.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.5094027161598205,
+      "epoch": 0.6195652173913043,
+      "grad_norm": 0.9465392231941223,
+      "learning_rate": 9.996162800955011e-05,
+      "loss": 1.1817,
+      "mean_token_accuracy": 0.5782815992832184,
+      "num_tokens": 735527.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.5501951813697814,
+      "epoch": 0.6304347826086957,
+      "grad_norm": 0.7445736527442932,
+      "learning_rate": 9.995426543076545e-05,
+      "loss": 1.2452,
+      "mean_token_accuracy": 0.5505437403917313,
+      "num_tokens": 748455.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.5227739214897156,
+      "epoch": 0.6413043478260869,
+      "grad_norm": 0.8378339409828186,
+      "learning_rate": 9.994625742221586e-05,
+      "loss": 1.2551,
+      "mean_token_accuracy": 0.5548771649599076,
+      "num_tokens": 761420.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.5428736090660096,
+      "epoch": 0.6521739130434783,
+      "grad_norm": 0.9249877333641052,
+      "learning_rate": 9.993760408736814e-05,
+      "loss": 1.282,
+      "mean_token_accuracy": 0.5393997848033905,
+      "num_tokens": 773676.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.5630847930908203,
+      "epoch": 0.6630434782608695,
+      "grad_norm": 0.8152625560760498,
+      "learning_rate": 9.992830553802696e-05,
+      "loss": 1.2763,
+      "mean_token_accuracy": 0.5402287989854813,
+      "num_tokens": 786757.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.532595145702362,
+      "epoch": 0.6739130434782609,
+      "grad_norm": 0.7313966751098633,
+      "learning_rate": 9.991836189433342e-05,
+      "loss": 1.2323,
+      "mean_token_accuracy": 0.5645015567541123,
+      "num_tokens": 799851.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.5160420179367065,
+      "epoch": 0.6847826086956522,
+      "grad_norm": 0.7158486843109131,
+      "learning_rate": 9.990777328476348e-05,
+      "loss": 1.2021,
+      "mean_token_accuracy": 0.555733984708786,
+      "num_tokens": 812648.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.5084859609603882,
+      "epoch": 0.6956521739130435,
+      "grad_norm": 0.7056333422660828,
+      "learning_rate": 9.98965398461264e-05,
+      "loss": 1.176,
+      "mean_token_accuracy": 0.5772889316082,
+      "num_tokens": 825054.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.5172175765037537,
+      "epoch": 0.7065217391304348,
+      "grad_norm": 0.8173061013221741,
+      "learning_rate": 9.988466172356282e-05,
+      "loss": 1.1893,
+      "mean_token_accuracy": 0.5774871349334717,
+      "num_tokens": 838148.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.5155822157859802,
+      "epoch": 0.717391304347826,
+      "grad_norm": 0.7483378648757935,
+      "learning_rate": 9.9872139070543e-05,
+      "loss": 1.2377,
+      "mean_token_accuracy": 0.5529649972915649,
+      "num_tokens": 851079.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.5392202258110046,
+      "epoch": 0.7282608695652174,
+      "grad_norm": 0.8020080924034119,
+      "learning_rate": 9.985897204886481e-05,
+      "loss": 1.2471,
+      "mean_token_accuracy": 0.5591055184602738,
+      "num_tokens": 863673.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.520468044281006,
+      "epoch": 0.7391304347826086,
+      "grad_norm": 0.7957432866096497,
+      "learning_rate": 9.984516082865159e-05,
+      "loss": 1.2582,
+      "mean_token_accuracy": 0.5404952645301819,
+      "num_tokens": 876764.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.5024335741996766,
+      "epoch": 0.75,
+      "grad_norm": 0.8745436072349548,
+      "learning_rate": 9.983070558835002e-05,
+      "loss": 1.2029,
+      "mean_token_accuracy": 0.5673643052577972,
+      "num_tokens": 889851.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.5227225780487061,
+      "epoch": 0.7608695652173914,
+      "grad_norm": 0.7866286039352417,
+      "learning_rate": 9.981560651472781e-05,
+      "loss": 1.2597,
+      "mean_token_accuracy": 0.5447615504264831,
+      "num_tokens": 903182.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.5330517888069153,
+      "epoch": 0.7717391304347826,
+      "grad_norm": 0.696030855178833,
+      "learning_rate": 9.97998638028712e-05,
+      "loss": 1.2417,
+      "mean_token_accuracy": 0.5569504171609878,
+      "num_tokens": 916564.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.5023605585098267,
+      "epoch": 0.782608695652174,
+      "grad_norm": 0.7980480194091797,
+      "learning_rate": 9.978347765618257e-05,
+      "loss": 1.2073,
+      "mean_token_accuracy": 0.562690931558609,
+      "num_tokens": 929820.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.5466702461242676,
+      "epoch": 0.7934782608695652,
+      "grad_norm": 0.8441147804260254,
+      "learning_rate": 9.976644828637767e-05,
+      "loss": 1.2859,
+      "mean_token_accuracy": 0.5330282121896743,
+      "num_tokens": 942449.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.515641415119171,
+      "epoch": 0.8043478260869565,
+      "grad_norm": 0.8833957314491272,
+      "learning_rate": 9.974877591348304e-05,
+      "loss": 1.2627,
+      "mean_token_accuracy": 0.5418030679225921,
+      "num_tokens": 955620.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.5292868852615356,
+      "epoch": 0.8152173913043478,
+      "grad_norm": 0.8666150569915771,
+      "learning_rate": 9.973046076583301e-05,
+      "loss": 1.2364,
+      "mean_token_accuracy": 0.5494832009077072,
+      "num_tokens": 968954.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.5135694026947022,
+      "epoch": 0.8260869565217391,
+      "grad_norm": 0.9172241687774658,
+      "learning_rate": 9.97115030800669e-05,
+      "loss": 1.2053,
+      "mean_token_accuracy": 0.5607668071985245,
+      "num_tokens": 981323.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.5214222908020019,
+      "epoch": 0.8369565217391305,
+      "grad_norm": 0.9353718161582947,
+      "learning_rate": 9.969190310112579e-05,
+      "loss": 1.225,
+      "mean_token_accuracy": 0.5599299073219299,
+      "num_tokens": 994834.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.5419185280799865,
+      "epoch": 0.8478260869565217,
+      "grad_norm": 0.717232882976532,
+      "learning_rate": 9.967166108224957e-05,
+      "loss": 1.2848,
+      "mean_token_accuracy": 0.5360999226570129,
+      "num_tokens": 1007806.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.545407807826996,
+      "epoch": 0.8586956521739131,
+      "grad_norm": 0.745928943157196,
+      "learning_rate": 9.965077728497348e-05,
+      "loss": 1.2683,
+      "mean_token_accuracy": 0.5427737534046173,
+      "num_tokens": 1021093.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.5416621446609498,
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.8545331954956055,
+      "learning_rate": 9.96292519791248e-05,
+      "loss": 1.3036,
+      "mean_token_accuracy": 0.5352708637714386,
+      "num_tokens": 1034317.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.516196882724762,
+      "epoch": 0.8804347826086957,
+      "grad_norm": 0.8239868879318237,
+      "learning_rate": 9.96070854428194e-05,
+      "loss": 1.1943,
+      "mean_token_accuracy": 0.568702632188797,
+      "num_tokens": 1047679.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.5478002548217773,
+      "epoch": 0.8913043478260869,
+      "grad_norm": 0.9187906980514526,
+      "learning_rate": 9.958427796245808e-05,
+      "loss": 1.2707,
+      "mean_token_accuracy": 0.5460701882839203,
+      "num_tokens": 1060840.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.540980589389801,
+      "epoch": 0.9021739130434783,
+      "grad_norm": 0.774869978427887,
+      "learning_rate": 9.956082983272293e-05,
+      "loss": 1.2397,
+      "mean_token_accuracy": 0.5464379012584686,
+      "num_tokens": 1073529.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.5131322622299195,
+      "epoch": 0.9130434782608695,
+      "grad_norm": 1.029721975326538,
+      "learning_rate": 9.953674135657345e-05,
+      "loss": 1.2198,
+      "mean_token_accuracy": 0.5641603857278824,
+      "num_tokens": 1086600.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.5259755611419679,
+      "epoch": 0.9239130434782609,
+      "grad_norm": 0.8057295083999634,
+      "learning_rate": 9.951201284524275e-05,
+      "loss": 1.2492,
+      "mean_token_accuracy": 0.5562368750572204,
+      "num_tokens": 1099737.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.5159748077392579,
+      "epoch": 0.9347826086956522,
+      "grad_norm": 0.6001420617103577,
+      "learning_rate": 9.94866446182334e-05,
+      "loss": 1.2524,
+      "mean_token_accuracy": 0.5458084315061569,
+      "num_tokens": 1112239.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.524741494655609,
+      "epoch": 0.9456521739130435,
+      "grad_norm": 0.847523033618927,
+      "learning_rate": 9.94606370033134e-05,
+      "loss": 1.2245,
+      "mean_token_accuracy": 0.5601188719272614,
+      "num_tokens": 1125191.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.5371912598609925,
+      "epoch": 0.9565217391304348,
+      "grad_norm": 0.767745852470398,
+      "learning_rate": 9.943399033651189e-05,
+      "loss": 1.2319,
+      "mean_token_accuracy": 0.5546965420246124,
+      "num_tokens": 1138077.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.5223184943199157,
+      "epoch": 0.967391304347826,
+      "grad_norm": 0.9313151836395264,
+      "learning_rate": 9.94067049621148e-05,
+      "loss": 1.2237,
+      "mean_token_accuracy": 0.5578917026519775,
+      "num_tokens": 1151364.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.530699372291565,
+      "epoch": 0.9782608695652174,
+      "grad_norm": 0.7053420543670654,
+      "learning_rate": 9.937878123266044e-05,
+      "loss": 1.2269,
+      "mean_token_accuracy": 0.5488695651292801,
+      "num_tokens": 1164326.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.5197353124618531,
+      "epoch": 0.9891304347826086,
+      "grad_norm": 0.9986150860786438,
+      "learning_rate": 9.9350219508935e-05,
+      "loss": 1.2106,
+      "mean_token_accuracy": 0.5582584798336029,
+      "num_tokens": 1176914.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.5376808762550354,
+      "epoch": 1.0,
+      "grad_norm": 0.7129160165786743,
+      "learning_rate": 9.93210201599677e-05,
+      "loss": 1.2377,
+      "mean_token_accuracy": 0.557282817363739,
+      "num_tokens": 1189994.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.5522505402565003,
+      "epoch": 1.0108695652173914,
+      "grad_norm": 0.9139987230300903,
+      "learning_rate": 9.929118356302621e-05,
+      "loss": 1.2492,
+      "mean_token_accuracy": 0.5444983661174774,
+      "num_tokens": 1202961.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.5519829273223877,
+      "epoch": 1.0217391304347827,
+      "grad_norm": 1.0422664880752563,
+      "learning_rate": 9.926071010361173e-05,
+      "loss": 1.1957,
+      "mean_token_accuracy": 0.5779279708862305,
+      "num_tokens": 1215901.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.5434082865715026,
+      "epoch": 1.0326086956521738,
+      "grad_norm": 1.0472567081451416,
+      "learning_rate": 9.922960017545395e-05,
+      "loss": 1.2263,
+      "mean_token_accuracy": 0.5640866041183472,
+      "num_tokens": 1228567.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.5352994203567505,
+      "epoch": 1.0434782608695652,
+      "grad_norm": 1.0810585021972656,
+      "learning_rate": 9.919785418050598e-05,
+      "loss": 1.1876,
+      "mean_token_accuracy": 0.5709751307964325,
+      "num_tokens": 1241529.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.4996863842010497,
+      "epoch": 1.0543478260869565,
+      "grad_norm": 1.1204661130905151,
+      "learning_rate": 9.916547252893923e-05,
+      "loss": 1.1354,
+      "mean_token_accuracy": 0.5961336076259613,
+      "num_tokens": 1254137.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.5315279841423035,
+      "epoch": 1.065217391304348,
+      "grad_norm": 1.0741767883300781,
+      "learning_rate": 9.9132455639138e-05,
+      "loss": 1.1422,
+      "mean_token_accuracy": 0.5875493228435517,
+      "num_tokens": 1266871.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.516059410572052,
+      "epoch": 1.0760869565217392,
+      "grad_norm": 1.1965429782867432,
+      "learning_rate": 9.90988039376942e-05,
+      "loss": 1.1438,
+      "mean_token_accuracy": 0.5906685352325439,
+      "num_tokens": 1279655.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.5148675918579102,
+      "epoch": 1.0869565217391304,
+      "grad_norm": 1.1992353200912476,
+      "learning_rate": 9.906451785940167e-05,
+      "loss": 1.1636,
+      "mean_token_accuracy": 0.5710582077503205,
+      "num_tokens": 1292202.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.5187682271003724,
+      "epoch": 1.0978260869565217,
+      "grad_norm": 1.0606764554977417,
+      "learning_rate": 9.902959784725077e-05,
+      "loss": 1.1763,
+      "mean_token_accuracy": 0.5760969400405884,
+      "num_tokens": 1305284.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.5265469312667848,
+      "epoch": 1.108695652173913,
+      "grad_norm": 1.02944815158844,
+      "learning_rate": 9.899404435242246e-05,
+      "loss": 1.2096,
+      "mean_token_accuracy": 0.5624277234077454,
+      "num_tokens": 1318408.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.5455774545669556,
+      "epoch": 1.1195652173913044,
+      "grad_norm": 1.1493759155273438,
+      "learning_rate": 9.895785783428262e-05,
+      "loss": 1.1652,
+      "mean_token_accuracy": 0.5867336988449097,
+      "num_tokens": 1331156.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.5371973156929015,
+      "epoch": 1.1304347826086956,
+      "grad_norm": 0.9468239545822144,
+      "learning_rate": 9.8921038760376e-05,
+      "loss": 1.2371,
+      "mean_token_accuracy": 0.5544474184513092,
+      "num_tokens": 1343904.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.5403631925582886,
+      "epoch": 1.141304347826087,
+      "grad_norm": 1.1717609167099,
+      "learning_rate": 9.888358760642029e-05,
+      "loss": 1.1394,
+      "mean_token_accuracy": 0.5933512449264526,
+      "num_tokens": 1356797.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.5518387794494628,
+      "epoch": 1.1521739130434783,
+      "grad_norm": 1.2024801969528198,
+      "learning_rate": 9.884550485629987e-05,
+      "loss": 1.2065,
+      "mean_token_accuracy": 0.5667118012905121,
+      "num_tokens": 1369690.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.5736596703529357,
+      "epoch": 1.1630434782608696,
+      "grad_norm": 1.0323596000671387,
+      "learning_rate": 9.88067910020596e-05,
+      "loss": 1.2124,
+      "mean_token_accuracy": 0.5691272497177124,
+      "num_tokens": 1382561.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.57814359664917,
+      "epoch": 1.1739130434782608,
+      "grad_norm": 1.1128944158554077,
+      "learning_rate": 9.876744654389854e-05,
+      "loss": 1.2319,
+      "mean_token_accuracy": 0.554848113656044,
+      "num_tokens": 1395409.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.5651036262512208,
+      "epoch": 1.184782608695652,
+      "grad_norm": 1.1131497621536255,
+      "learning_rate": 9.872747199016328e-05,
+      "loss": 1.1995,
+      "mean_token_accuracy": 0.5680587291717529,
+      "num_tokens": 1408511.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.519801914691925,
+      "epoch": 1.1956521739130435,
+      "grad_norm": 0.8381641507148743,
+      "learning_rate": 9.868686785734165e-05,
+      "loss": 1.1729,
+      "mean_token_accuracy": 0.5780038118362427,
+      "num_tokens": 1421328.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.5411308765411378,
+      "epoch": 1.2065217391304348,
+      "grad_norm": 1.1784008741378784,
+      "learning_rate": 9.86456346700558e-05,
+      "loss": 1.2026,
+      "mean_token_accuracy": 0.5581619143486023,
+      "num_tokens": 1434644.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.524932038784027,
+      "epoch": 1.2173913043478262,
+      "grad_norm": 0.9289618730545044,
+      "learning_rate": 9.860377296105556e-05,
+      "loss": 1.219,
+      "mean_token_accuracy": 0.557993471622467,
+      "num_tokens": 1447469.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.5029574513435364,
+      "epoch": 1.2282608695652173,
+      "grad_norm": 1.0168135166168213,
+      "learning_rate": 9.856128327121155e-05,
+      "loss": 1.1589,
+      "mean_token_accuracy": 0.578672569990158,
+      "num_tokens": 1460202.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.5095925211906434,
+      "epoch": 1.2391304347826086,
+      "grad_norm": 1.052454948425293,
+      "learning_rate": 9.85181661495081e-05,
+      "loss": 1.2232,
+      "mean_token_accuracy": 0.5522898703813552,
+      "num_tokens": 1473114.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.5059074401855468,
+      "epoch": 1.25,
+      "grad_norm": 1.20883309841156,
+      "learning_rate": 9.847442215303626e-05,
+      "loss": 1.2172,
+      "mean_token_accuracy": 0.5659465253353119,
+      "num_tokens": 1485990.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.494919514656067,
+      "epoch": 1.2608695652173914,
+      "grad_norm": 1.1653634309768677,
+      "learning_rate": 9.843005184698655e-05,
+      "loss": 1.1817,
+      "mean_token_accuracy": 0.5764101088047028,
+      "num_tokens": 1498939.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.5184181690216065,
+      "epoch": 1.2717391304347827,
+      "grad_norm": 1.1174242496490479,
+      "learning_rate": 9.838505580464168e-05,
+      "loss": 1.1976,
+      "mean_token_accuracy": 0.5707351744174958,
+      "num_tokens": 1511943.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.5217233657836915,
+      "epoch": 1.2826086956521738,
+      "grad_norm": 1.0029795169830322,
+      "learning_rate": 9.833943460736912e-05,
+      "loss": 1.2296,
+      "mean_token_accuracy": 0.5572409898042678,
+      "num_tokens": 1525135.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.514461922645569,
+      "epoch": 1.2934782608695652,
+      "grad_norm": 1.2473056316375732,
+      "learning_rate": 9.829318884461359e-05,
+      "loss": 1.221,
+      "mean_token_accuracy": 0.5566778779029846,
+      "num_tokens": 1537699.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.5298507332801818,
+      "epoch": 1.3043478260869565,
+      "grad_norm": 1.068049430847168,
+      "learning_rate": 9.824631911388948e-05,
+      "loss": 1.248,
+      "mean_token_accuracy": 0.5430671572685242,
+      "num_tokens": 1550938.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.5463980197906495,
+      "epoch": 1.315217391304348,
+      "grad_norm": 1.0760388374328613,
+      "learning_rate": 9.819882602077309e-05,
+      "loss": 1.2825,
+      "mean_token_accuracy": 0.5330462247133255,
+      "num_tokens": 1563597.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.5457575082778932,
+      "epoch": 1.3260869565217392,
+      "grad_norm": 1.1161272525787354,
+      "learning_rate": 9.815071017889482e-05,
+      "loss": 1.2598,
+      "mean_token_accuracy": 0.543943053483963,
+      "num_tokens": 1576301.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.5349620819091796,
+      "epoch": 1.3369565217391304,
+      "grad_norm": 1.1779778003692627,
+      "learning_rate": 9.810197220993123e-05,
+      "loss": 1.2551,
+      "mean_token_accuracy": 0.5386941403150558,
+      "num_tokens": 1589776.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.5158817052841187,
+      "epoch": 1.3478260869565217,
+      "grad_norm": 1.1150175333023071,
+      "learning_rate": 9.805261274359705e-05,
+      "loss": 1.193,
+      "mean_token_accuracy": 0.5642519950866699,
+      "num_tokens": 1602239.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.512274718284607,
+      "epoch": 1.358695652173913,
+      "grad_norm": 0.9392043948173523,
+      "learning_rate": 9.800263241763698e-05,
+      "loss": 1.2334,
+      "mean_token_accuracy": 0.5577278465032578,
+      "num_tokens": 1615621.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.5087523460388184,
+      "epoch": 1.3695652173913042,
+      "grad_norm": 0.9521236419677734,
+      "learning_rate": 9.795203187781751e-05,
+      "loss": 1.1651,
+      "mean_token_accuracy": 0.5836262464523315,
+      "num_tokens": 1628741.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.5212602257728576,
+      "epoch": 1.3804347826086958,
+      "grad_norm": 0.9689566493034363,
+      "learning_rate": 9.790081177791852e-05,
+      "loss": 1.1944,
+      "mean_token_accuracy": 0.572248637676239,
+      "num_tokens": 1641646.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.521955931186676,
+      "epoch": 1.391304347826087,
+      "grad_norm": 1.016711711883545,
+      "learning_rate": 9.784897277972491e-05,
+      "loss": 1.2105,
+      "mean_token_accuracy": 0.5605559885501862,
+      "num_tokens": 1654499.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.5126453638076782,
+      "epoch": 1.4021739130434783,
+      "grad_norm": 1.1951313018798828,
+      "learning_rate": 9.779651555301794e-05,
+      "loss": 1.2305,
+      "mean_token_accuracy": 0.5537042915821075,
+      "num_tokens": 1667748.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.528828752040863,
+      "epoch": 1.4130434782608696,
+      "grad_norm": 1.1385231018066406,
+      "learning_rate": 9.77434407755667e-05,
+      "loss": 1.2294,
+      "mean_token_accuracy": 0.554050150513649,
+      "num_tokens": 1681184.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.510583758354187,
+      "epoch": 1.4239130434782608,
+      "grad_norm": 1.0576328039169312,
+      "learning_rate": 9.768974913311922e-05,
+      "loss": 1.2674,
+      "mean_token_accuracy": 0.5414516568183899,
+      "num_tokens": 1693818.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.5150775551795959,
+      "epoch": 1.434782608695652,
+      "grad_norm": 1.3364728689193726,
+      "learning_rate": 9.763544131939374e-05,
+      "loss": 1.2075,
+      "mean_token_accuracy": 0.559964632987976,
+      "num_tokens": 1706939.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.5151704668998718,
+      "epoch": 1.4456521739130435,
+      "grad_norm": 1.02871835231781,
+      "learning_rate": 9.758051803606971e-05,
+      "loss": 1.2487,
+      "mean_token_accuracy": 0.552227908372879,
+      "num_tokens": 1719315.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.5152636528015138,
+      "epoch": 1.4565217391304348,
+      "grad_norm": 1.0097824335098267,
+      "learning_rate": 9.75249799927786e-05,
+      "loss": 1.2263,
+      "mean_token_accuracy": 0.5533849179744721,
+      "num_tokens": 1731891.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.512537384033203,
+      "epoch": 1.4673913043478262,
+      "grad_norm": 1.2632033824920654,
+      "learning_rate": 9.746882790709491e-05,
+      "loss": 1.222,
+      "mean_token_accuracy": 0.5614925265312195,
+      "num_tokens": 1744427.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.5295302748680115,
+      "epoch": 1.4782608695652173,
+      "grad_norm": 1.113368034362793,
+      "learning_rate": 9.741206250452683e-05,
+      "loss": 1.2735,
+      "mean_token_accuracy": 0.539223712682724,
+      "num_tokens": 1757083.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.536200964450836,
+      "epoch": 1.4891304347826086,
+      "grad_norm": 1.1522810459136963,
+      "learning_rate": 9.735468451850681e-05,
+      "loss": 1.2152,
+      "mean_token_accuracy": 0.565186282992363,
+      "num_tokens": 1769982.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.495800745487213,
+      "epoch": 1.5,
+      "grad_norm": 1.2632598876953125,
+      "learning_rate": 9.729669469038216e-05,
+      "loss": 1.1635,
+      "mean_token_accuracy": 0.5871178984642029,
+      "num_tokens": 1783102.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.535517191886902,
+      "epoch": 1.5108695652173914,
+      "grad_norm": 0.9593290090560913,
+      "learning_rate": 9.723809376940544e-05,
+      "loss": 1.2108,
+      "mean_token_accuracy": 0.5709479689598084,
+      "num_tokens": 1796398.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.529611337184906,
+      "epoch": 1.5217391304347827,
+      "grad_norm": 1.0819748640060425,
+      "learning_rate": 9.717888251272477e-05,
+      "loss": 1.1972,
+      "mean_token_accuracy": 0.5633429378271103,
+      "num_tokens": 1809379.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.5493282318115233,
+      "epoch": 1.5326086956521738,
+      "grad_norm": 0.9472999572753906,
+      "learning_rate": 9.71190616853741e-05,
+      "loss": 1.2616,
+      "mean_token_accuracy": 0.5486618399620056,
+      "num_tokens": 1822664.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.4989375710487365,
+      "epoch": 1.5434782608695652,
+      "grad_norm": 1.2883214950561523,
+      "learning_rate": 9.705863206026321e-05,
+      "loss": 1.2137,
+      "mean_token_accuracy": 0.558601850271225,
+      "num_tokens": 1835336.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.5061516761779785,
+      "epoch": 1.5543478260869565,
+      "grad_norm": 0.9577755928039551,
+      "learning_rate": 9.699759441816787e-05,
+      "loss": 1.1739,
+      "mean_token_accuracy": 0.577557110786438,
+      "num_tokens": 1847755.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.5141437649726868,
+      "epoch": 1.5652173913043477,
+      "grad_norm": 1.0751005411148071,
+      "learning_rate": 9.693594954771965e-05,
+      "loss": 1.231,
+      "mean_token_accuracy": 0.5506497710943222,
+      "num_tokens": 1860302.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.5419356107711792,
+      "epoch": 1.5760869565217392,
+      "grad_norm": 1.0141667127609253,
+      "learning_rate": 9.687369824539577e-05,
+      "loss": 1.2788,
+      "mean_token_accuracy": 0.5303231775760651,
+      "num_tokens": 1873093.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.520876133441925,
+      "epoch": 1.5869565217391304,
+      "grad_norm": 1.109215259552002,
+      "learning_rate": 9.68108413155088e-05,
+      "loss": 1.2333,
+      "mean_token_accuracy": 0.5601014912128448,
+      "num_tokens": 1886177.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.4981224894523621,
+      "epoch": 1.5978260869565217,
+      "grad_norm": 0.9200493097305298,
+      "learning_rate": 9.674737957019624e-05,
+      "loss": 1.1852,
+      "mean_token_accuracy": 0.5700576066970825,
+      "num_tokens": 1899113.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.5140800833702088,
+      "epoch": 1.608695652173913,
+      "grad_norm": 1.190007209777832,
+      "learning_rate": 9.66833138294101e-05,
+      "loss": 1.1929,
+      "mean_token_accuracy": 0.5691904962062836,
+      "num_tokens": 1912474.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.5299779295921325,
+      "epoch": 1.6195652173913042,
+      "grad_norm": 0.9787003397941589,
+      "learning_rate": 9.661864492090625e-05,
+      "loss": 1.2179,
+      "mean_token_accuracy": 0.553766930103302,
+      "num_tokens": 1925685.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.5431510925292968,
+      "epoch": 1.6304347826086958,
+      "grad_norm": 1.1734333038330078,
+      "learning_rate": 9.655337368023371e-05,
+      "loss": 1.2108,
+      "mean_token_accuracy": 0.5539384454488754,
+      "num_tokens": 1938610.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.5246233105659486,
+      "epoch": 1.641304347826087,
+      "grad_norm": 1.072691559791565,
+      "learning_rate": 9.64875009507239e-05,
+      "loss": 1.1999,
+      "mean_token_accuracy": 0.5761029601097107,
+      "num_tokens": 1951241.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.538881742954254,
+      "epoch": 1.6521739130434783,
+      "grad_norm": 1.0783456563949585,
+      "learning_rate": 9.642102758347973e-05,
+      "loss": 1.2443,
+      "mean_token_accuracy": 0.5502734839916229,
+      "num_tokens": 1964816.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.550068199634552,
+      "epoch": 1.6630434782608696,
+      "grad_norm": 1.0582056045532227,
+      "learning_rate": 9.63539544373646e-05,
+      "loss": 1.2182,
+      "mean_token_accuracy": 0.5598388969898224,
+      "num_tokens": 1977930.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.5344447016716003,
+      "epoch": 1.6739130434782608,
+      "grad_norm": 0.9788505434989929,
+      "learning_rate": 9.628628237899126e-05,
+      "loss": 1.1852,
+      "mean_token_accuracy": 0.5595145970582962,
+      "num_tokens": 1991032.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.5468374967575074,
+      "epoch": 1.6847826086956523,
+      "grad_norm": 1.0464048385620117,
+      "learning_rate": 9.621801228271073e-05,
+      "loss": 1.2175,
+      "mean_token_accuracy": 0.5616866886615753,
+      "num_tokens": 2004207.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.5422045588493347,
+      "epoch": 1.6956521739130435,
+      "grad_norm": 0.8307158946990967,
+      "learning_rate": 9.614914503060083e-05,
+      "loss": 1.2202,
+      "mean_token_accuracy": 0.5515525698661804,
+      "num_tokens": 2016969.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.5343055129051208,
+      "epoch": 1.7065217391304348,
+      "grad_norm": 1.198614239692688,
+      "learning_rate": 9.607968151245498e-05,
+      "loss": 1.1866,
+      "mean_token_accuracy": 0.5771215856075287,
+      "num_tokens": 2029750.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.5321205615997315,
+      "epoch": 1.7173913043478262,
+      "grad_norm": 0.9247676134109497,
+      "learning_rate": 9.600962262577053e-05,
+      "loss": 1.2205,
+      "mean_token_accuracy": 0.5626431256532669,
+      "num_tokens": 2043181.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.541359269618988,
+      "epoch": 1.7282608695652173,
+      "grad_norm": 1.0934436321258545,
+      "learning_rate": 9.593896927573728e-05,
+      "loss": 1.2397,
+      "mean_token_accuracy": 0.5406488478183746,
+      "num_tokens": 2056541.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.5094799280166626,
+      "epoch": 1.7391304347826086,
+      "grad_norm": 0.8803229928016663,
+      "learning_rate": 9.586772237522573e-05,
+      "loss": 1.2047,
+      "mean_token_accuracy": 0.5659328937530518,
+      "num_tokens": 2069752.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.4946965217590331,
+      "epoch": 1.75,
+      "grad_norm": 1.0182883739471436,
+      "learning_rate": 9.579588284477526e-05,
+      "loss": 1.1492,
+      "mean_token_accuracy": 0.5829819083213806,
+      "num_tokens": 2083119.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.483540380001068,
+      "epoch": 1.7608695652173914,
+      "grad_norm": 1.2263387441635132,
+      "learning_rate": 9.572345161258235e-05,
+      "loss": 1.1474,
+      "mean_token_accuracy": 0.5895151972770691,
+      "num_tokens": 2095862.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.5072382926940917,
+      "epoch": 1.7717391304347827,
+      "grad_norm": 0.8639858365058899,
+      "learning_rate": 9.565042961448844e-05,
+      "loss": 1.1997,
+      "mean_token_accuracy": 0.5625985980033874,
+      "num_tokens": 2108549.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.5286765098571777,
+      "epoch": 1.7826086956521738,
+      "grad_norm": 1.0652116537094116,
+      "learning_rate": 9.557681779396797e-05,
+      "loss": 1.2253,
+      "mean_token_accuracy": 0.5569576025009155,
+      "num_tokens": 2120871.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.5006824493408204,
+      "epoch": 1.7934782608695652,
+      "grad_norm": 0.949942946434021,
+      "learning_rate": 9.550261710211608e-05,
+      "loss": 1.1973,
+      "mean_token_accuracy": 0.5634852379560471,
+      "num_tokens": 2134097.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.5050195574760437,
+      "epoch": 1.8043478260869565,
+      "grad_norm": 1.016350507736206,
+      "learning_rate": 9.542782849763637e-05,
+      "loss": 1.1709,
+      "mean_token_accuracy": 0.5780033886432647,
+      "num_tokens": 2147811.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.5177413702011109,
+      "epoch": 1.8152173913043477,
+      "grad_norm": 1.264799952507019,
+      "learning_rate": 9.535245294682857e-05,
+      "loss": 1.2513,
+      "mean_token_accuracy": 0.5521585702896118,
+      "num_tokens": 2160506.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.5374733328819274,
+      "epoch": 1.8260869565217392,
+      "grad_norm": 1.1713751554489136,
+      "learning_rate": 9.527649142357596e-05,
+      "loss": 1.2708,
+      "mean_token_accuracy": 0.5314113944768906,
+      "num_tokens": 2173328.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.5083181142807007,
+      "epoch": 1.8369565217391304,
+      "grad_norm": 1.1553071737289429,
+      "learning_rate": 9.519994490933279e-05,
+      "loss": 1.206,
+      "mean_token_accuracy": 0.5680734992027283,
+      "num_tokens": 2186452.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.5291340470314025,
+      "epoch": 1.8478260869565217,
+      "grad_norm": 1.1443442106246948,
+      "learning_rate": 9.51228143931117e-05,
+      "loss": 1.2351,
+      "mean_token_accuracy": 0.5539528131484985,
+      "num_tokens": 2199594.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.5204999327659607,
+      "epoch": 1.858695652173913,
+      "grad_norm": 1.1584019660949707,
+      "learning_rate": 9.504510087147088e-05,
+      "loss": 1.2338,
+      "mean_token_accuracy": 0.5519226849079132,
+      "num_tokens": 2212135.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.5614403247833253,
+      "epoch": 1.8695652173913042,
+      "grad_norm": 1.0798224210739136,
+      "learning_rate": 9.496680534850113e-05,
+      "loss": 1.2534,
+      "mean_token_accuracy": 0.5530328571796417,
+      "num_tokens": 2225159.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.5276212096214294,
+      "epoch": 1.8804347826086958,
+      "grad_norm": 1.1296766996383667,
+      "learning_rate": 9.488792883581299e-05,
+      "loss": 1.1784,
+      "mean_token_accuracy": 0.5774711936712265,
+      "num_tokens": 2238139.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.544056522846222,
+      "epoch": 1.891304347826087,
+      "grad_norm": 1.1214172840118408,
+      "learning_rate": 9.480847235252361e-05,
+      "loss": 1.2268,
+      "mean_token_accuracy": 0.5613886952400208,
+      "num_tokens": 2250928.0,
+      "step": 1740
+    },
+    {
+      "entropy": 1.5295695900917052,
+      "epoch": 1.9021739130434783,
+      "grad_norm": 1.1650352478027344,
+      "learning_rate": 9.472843692524363e-05,
+      "loss": 1.1573,
+      "mean_token_accuracy": 0.5787465155124665,
+      "num_tokens": 2263338.0,
+      "step": 1750
+    },
+    {
+      "entropy": 1.5347764611244201,
+      "epoch": 1.9130434782608696,
+      "grad_norm": 1.0249896049499512,
+      "learning_rate": 9.464782358806383e-05,
+      "loss": 1.1731,
+      "mean_token_accuracy": 0.5780636668205261,
+      "num_tokens": 2276200.0,
+      "step": 1760
+    },
+    {
+      "entropy": 1.5715635061264037,
+      "epoch": 1.9239130434782608,
+      "grad_norm": 1.0768051147460938,
+      "learning_rate": 9.45666333825419e-05,
+      "loss": 1.2585,
+      "mean_token_accuracy": 0.5452336609363556,
+      "num_tokens": 2289088.0,
+      "step": 1770
+    },
+    {
+      "entropy": 1.5402274131774902,
+      "epoch": 1.9347826086956523,
+      "grad_norm": 1.0846654176712036,
+      "learning_rate": 9.448486735768884e-05,
+      "loss": 1.1918,
+      "mean_token_accuracy": 0.5699589729309082,
+      "num_tokens": 2302544.0,
+      "step": 1780
+    },
+    {
+      "entropy": 1.5048401594161986,
+      "epoch": 1.9456521739130435,
+      "grad_norm": 1.1533433198928833,
+      "learning_rate": 9.440252656995551e-05,
+      "loss": 1.1792,
+      "mean_token_accuracy": 0.5685461640357972,
+      "num_tokens": 2315473.0,
+      "step": 1790
+    },
+    {
+      "entropy": 1.5128441214561463,
+      "epoch": 1.9565217391304348,
+      "grad_norm": 1.2847894430160522,
+      "learning_rate": 9.431961208321892e-05,
+      "loss": 1.1566,
+      "mean_token_accuracy": 0.5870453357696533,
+      "num_tokens": 2329176.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.5362990856170655,
+      "epoch": 1.9673913043478262,
+      "grad_norm": 1.2497868537902832,
+      "learning_rate": 9.423612496876855e-05,
+      "loss": 1.1896,
+      "mean_token_accuracy": 0.5719706892967225,
+      "num_tokens": 2341591.0,
+      "step": 1810
+    },
+    {
+      "entropy": 1.5580734014511108,
+      "epoch": 1.9782608695652173,
+      "grad_norm": 1.1140056848526,
+      "learning_rate": 9.415206630529241e-05,
+      "loss": 1.2434,
+      "mean_token_accuracy": 0.5461874425411224,
+      "num_tokens": 2354577.0,
+      "step": 1820
+    },
+    {
+      "entropy": 1.5499179720878602,
+      "epoch": 1.9891304347826086,
+      "grad_norm": 1.0708650350570679,
+      "learning_rate": 9.406743717886321e-05,
+      "loss": 1.1635,
+      "mean_token_accuracy": 0.5835445284843445,
+      "num_tokens": 2366934.0,
+      "step": 1830
+    },
+    {
+      "entropy": 1.5282660722732544,
+      "epoch": 2.0,
+      "grad_norm": 0.9982873797416687,
+      "learning_rate": 9.398223868292424e-05,
+      "loss": 1.162,
+      "mean_token_accuracy": 0.5795026063919068,
+      "num_tokens": 2379988.0,
+      "step": 1840
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0125617378081587e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d307793ac8defecd3c83909e3edd67ba0adff5dab9d19e8ababe22ba1e871ad
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe3124146d4d0372b460431bd6b6771f3c9e6e5f34e80127ceca6056b0fbd2b2
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2794 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 2760,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2980000615119933,
+      "epoch": 0.010869565217391304,
+      "grad_norm": 7.514286994934082,
+      "learning_rate": 1.956521739130435e-06,
+      "loss": 1.8548,
+      "mean_token_accuracy": 0.5365569293498993,
+      "num_tokens": 13273.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.3184159398078918,
+      "epoch": 0.021739130434782608,
+      "grad_norm": 6.582128524780273,
+      "learning_rate": 4.130434782608695e-06,
+      "loss": 1.9416,
+      "mean_token_accuracy": 0.5010036021471024,
+      "num_tokens": 26299.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.302778995037079,
+      "epoch": 0.03260869565217391,
+      "grad_norm": 6.661994457244873,
+      "learning_rate": 6.304347826086957e-06,
+      "loss": 1.7644,
+      "mean_token_accuracy": 0.5327741354703903,
+      "num_tokens": 39608.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.355496096611023,
+      "epoch": 0.043478260869565216,
+      "grad_norm": 2.829239845275879,
+      "learning_rate": 8.478260869565217e-06,
+      "loss": 1.5473,
+      "mean_token_accuracy": 0.5216561764478683,
+      "num_tokens": 52279.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.4233315467834473,
+      "epoch": 0.05434782608695652,
+      "grad_norm": 1.384964108467102,
+      "learning_rate": 1.0652173913043479e-05,
+      "loss": 1.3512,
+      "mean_token_accuracy": 0.5346131652593613,
+      "num_tokens": 65371.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.4783715605735779,
+      "epoch": 0.06521739130434782,
+      "grad_norm": 1.2184863090515137,
+      "learning_rate": 1.2826086956521741e-05,
+      "loss": 1.3353,
+      "mean_token_accuracy": 0.5265826016664505,
+      "num_tokens": 78549.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.4811665654182433,
+      "epoch": 0.07608695652173914,
+      "grad_norm": 0.8817082047462463,
+      "learning_rate": 1.5e-05,
+      "loss": 1.2885,
+      "mean_token_accuracy": 0.5369167566299439,
+      "num_tokens": 91168.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.485330879688263,
+      "epoch": 0.08695652173913043,
+      "grad_norm": 1.0375007390975952,
+      "learning_rate": 1.7173913043478263e-05,
+      "loss": 1.3207,
+      "mean_token_accuracy": 0.5182694345712662,
+      "num_tokens": 104210.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.4509355902671814,
+      "epoch": 0.09782608695652174,
+      "grad_norm": 0.866616427898407,
+      "learning_rate": 1.9347826086956523e-05,
+      "loss": 1.2442,
+      "mean_token_accuracy": 0.5508454263210296,
+      "num_tokens": 117342.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.4595998883247376,
+      "epoch": 0.10869565217391304,
+      "grad_norm": 0.9921526312828064,
+      "learning_rate": 2.1521739130434784e-05,
+      "loss": 1.2513,
+      "mean_token_accuracy": 0.5439675092697144,
+      "num_tokens": 130168.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.468259596824646,
+      "epoch": 0.11956521739130435,
+      "grad_norm": 0.8542688488960266,
+      "learning_rate": 2.3695652173913045e-05,
+      "loss": 1.2523,
+      "mean_token_accuracy": 0.5456153243780136,
+      "num_tokens": 143277.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.4652462244033813,
+      "epoch": 0.13043478260869565,
+      "grad_norm": 0.8958607316017151,
+      "learning_rate": 2.5869565217391305e-05,
+      "loss": 1.2564,
+      "mean_token_accuracy": 0.5374186933040619,
+      "num_tokens": 155929.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.4442671895027162,
+      "epoch": 0.14130434782608695,
+      "grad_norm": 1.0437828302383423,
+      "learning_rate": 2.8043478260869566e-05,
+      "loss": 1.2463,
+      "mean_token_accuracy": 0.5506911396980285,
+      "num_tokens": 168922.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.442794382572174,
+      "epoch": 0.15217391304347827,
+      "grad_norm": 1.1950273513793945,
+      "learning_rate": 3.0217391304347827e-05,
+      "loss": 1.2343,
+      "mean_token_accuracy": 0.561489287018776,
+      "num_tokens": 181883.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.4483441829681396,
+      "epoch": 0.16304347826086957,
+      "grad_norm": 1.27411687374115,
+      "learning_rate": 3.239130434782609e-05,
+      "loss": 1.2515,
+      "mean_token_accuracy": 0.5461658954620361,
+      "num_tokens": 194847.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.4531602740287781,
+      "epoch": 0.17391304347826086,
+      "grad_norm": 0.9844512343406677,
+      "learning_rate": 3.456521739130435e-05,
+      "loss": 1.2379,
+      "mean_token_accuracy": 0.5450588703155518,
+      "num_tokens": 207431.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.4460819005966186,
+      "epoch": 0.18478260869565216,
+      "grad_norm": 0.965182363986969,
+      "learning_rate": 3.673913043478261e-05,
+      "loss": 1.2497,
+      "mean_token_accuracy": 0.5442000389099121,
+      "num_tokens": 220382.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.4696384787559509,
+      "epoch": 0.1956521739130435,
+      "grad_norm": 0.8425037860870361,
+      "learning_rate": 3.8913043478260866e-05,
+      "loss": 1.2847,
+      "mean_token_accuracy": 0.5304420441389084,
+      "num_tokens": 232940.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.4491984724998475,
+      "epoch": 0.20652173913043478,
+      "grad_norm": 1.1692280769348145,
+      "learning_rate": 4.1086956521739134e-05,
+      "loss": 1.2342,
+      "mean_token_accuracy": 0.5570813834667205,
+      "num_tokens": 245747.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.466271436214447,
+      "epoch": 0.21739130434782608,
+      "grad_norm": 1.0157368183135986,
+      "learning_rate": 4.3260869565217394e-05,
+      "loss": 1.2499,
+      "mean_token_accuracy": 0.5432725459337234,
+      "num_tokens": 258696.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.4768565893173218,
+      "epoch": 0.22826086956521738,
+      "grad_norm": 1.109692096710205,
+      "learning_rate": 4.5434782608695655e-05,
+      "loss": 1.2343,
+      "mean_token_accuracy": 0.5567020237445831,
+      "num_tokens": 271378.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.4473167181015014,
+      "epoch": 0.2391304347826087,
+      "grad_norm": 0.850563108921051,
+      "learning_rate": 4.7608695652173916e-05,
+      "loss": 1.1959,
+      "mean_token_accuracy": 0.5724921762943268,
+      "num_tokens": 284704.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.4478083968162536,
+      "epoch": 0.25,
+      "grad_norm": 1.0289748907089233,
+      "learning_rate": 4.9782608695652176e-05,
+      "loss": 1.2392,
+      "mean_token_accuracy": 0.5519216269254684,
+      "num_tokens": 296961.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.4915278434753418,
+      "epoch": 0.2608695652173913,
+      "grad_norm": 1.3161778450012207,
+      "learning_rate": 5.195652173913044e-05,
+      "loss": 1.2539,
+      "mean_token_accuracy": 0.5443875581026077,
+      "num_tokens": 310082.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.4435262322425841,
+      "epoch": 0.2717391304347826,
+      "grad_norm": 1.2697113752365112,
+      "learning_rate": 5.41304347826087e-05,
+      "loss": 1.1911,
+      "mean_token_accuracy": 0.576522421836853,
+      "num_tokens": 323044.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.4607349276542663,
+      "epoch": 0.2826086956521739,
+      "grad_norm": 0.8006339073181152,
+      "learning_rate": 5.630434782608696e-05,
+      "loss": 1.2088,
+      "mean_token_accuracy": 0.5584357857704163,
+      "num_tokens": 336108.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.4630830883979797,
+      "epoch": 0.29347826086956524,
+      "grad_norm": 0.8462095856666565,
+      "learning_rate": 5.847826086956521e-05,
+      "loss": 1.2458,
+      "mean_token_accuracy": 0.5520103573799133,
+      "num_tokens": 349210.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.458599328994751,
+      "epoch": 0.30434782608695654,
+      "grad_norm": 0.930942177772522,
+      "learning_rate": 6.0652173913043487e-05,
+      "loss": 1.2219,
+      "mean_token_accuracy": 0.5603324949741364,
+      "num_tokens": 361465.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.4730467200279236,
+      "epoch": 0.31521739130434784,
+      "grad_norm": 0.9836443066596985,
+      "learning_rate": 6.282608695652175e-05,
+      "loss": 1.2493,
+      "mean_token_accuracy": 0.5466845005750656,
+      "num_tokens": 374931.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.4596371173858642,
+      "epoch": 0.32608695652173914,
+      "grad_norm": 0.9860939383506775,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 1.2174,
+      "mean_token_accuracy": 0.556469538807869,
+      "num_tokens": 387929.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.4784631490707398,
+      "epoch": 0.33695652173913043,
+      "grad_norm": 0.8261193037033081,
+      "learning_rate": 6.717391304347827e-05,
+      "loss": 1.2191,
+      "mean_token_accuracy": 0.5600455164909363,
+      "num_tokens": 401392.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.4627429723739624,
+      "epoch": 0.34782608695652173,
+      "grad_norm": 0.896903395652771,
+      "learning_rate": 6.934782608695653e-05,
+      "loss": 1.1987,
+      "mean_token_accuracy": 0.5688268154859543,
+      "num_tokens": 414466.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.481223452091217,
+      "epoch": 0.358695652173913,
+      "grad_norm": 0.9765130877494812,
+      "learning_rate": 7.152173913043479e-05,
+      "loss": 1.2161,
+      "mean_token_accuracy": 0.5661008894443512,
+      "num_tokens": 427231.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.5086342811584472,
+      "epoch": 0.3695652173913043,
+      "grad_norm": 0.8136937022209167,
+      "learning_rate": 7.369565217391304e-05,
+      "loss": 1.2884,
+      "mean_token_accuracy": 0.5307422339916229,
+      "num_tokens": 439856.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.4857903122901917,
+      "epoch": 0.3804347826086957,
+      "grad_norm": 0.913378894329071,
+      "learning_rate": 7.58695652173913e-05,
+      "loss": 1.2631,
+      "mean_token_accuracy": 0.5459983497858047,
+      "num_tokens": 452683.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.5006132960319518,
+      "epoch": 0.391304347826087,
+      "grad_norm": 1.0260237455368042,
+      "learning_rate": 7.804347826086957e-05,
+      "loss": 1.2587,
+      "mean_token_accuracy": 0.5429587304592133,
+      "num_tokens": 465274.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.4861261010169984,
+      "epoch": 0.40217391304347827,
+      "grad_norm": 1.04011869430542,
+      "learning_rate": 8.021739130434783e-05,
+      "loss": 1.2147,
+      "mean_token_accuracy": 0.5620492398738861,
+      "num_tokens": 478175.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.4943390011787414,
+      "epoch": 0.41304347826086957,
+      "grad_norm": 0.9155416488647461,
+      "learning_rate": 8.23913043478261e-05,
+      "loss": 1.2128,
+      "mean_token_accuracy": 0.5667012810707093,
+      "num_tokens": 491001.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.5116252064704896,
+      "epoch": 0.42391304347826086,
+      "grad_norm": 0.8238904476165771,
+      "learning_rate": 8.456521739130435e-05,
+      "loss": 1.2677,
+      "mean_token_accuracy": 0.5370148032903671,
+      "num_tokens": 503764.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.4961246132850647,
+      "epoch": 0.43478260869565216,
+      "grad_norm": 0.8830587863922119,
+      "learning_rate": 8.673913043478261e-05,
+      "loss": 1.1999,
+      "mean_token_accuracy": 0.5743164956569672,
+      "num_tokens": 516294.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.5065942287445069,
+      "epoch": 0.44565217391304346,
+      "grad_norm": 0.9117815494537354,
+      "learning_rate": 8.891304347826088e-05,
+      "loss": 1.2607,
+      "mean_token_accuracy": 0.550678727030754,
+      "num_tokens": 529384.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.5079344272613526,
+      "epoch": 0.45652173913043476,
+      "grad_norm": 0.8730387091636658,
+      "learning_rate": 9.108695652173914e-05,
+      "loss": 1.2087,
+      "mean_token_accuracy": 0.5660586059093475,
+      "num_tokens": 542010.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.5147196769714355,
+      "epoch": 0.4673913043478261,
+      "grad_norm": 0.7791972160339355,
+      "learning_rate": 9.32608695652174e-05,
+      "loss": 1.2471,
+      "mean_token_accuracy": 0.5513437986373901,
+      "num_tokens": 554428.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.5182705640792846,
+      "epoch": 0.4782608695652174,
+      "grad_norm": 0.7569729089736938,
+      "learning_rate": 9.543478260869566e-05,
+      "loss": 1.2876,
+      "mean_token_accuracy": 0.5325394898653031,
+      "num_tokens": 567462.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.5036618828773498,
+      "epoch": 0.4891304347826087,
+      "grad_norm": 0.7794932126998901,
+      "learning_rate": 9.760869565217392e-05,
+      "loss": 1.2539,
+      "mean_token_accuracy": 0.5439064025878906,
+      "num_tokens": 580377.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.4947790503501892,
+      "epoch": 0.5,
+      "grad_norm": 0.8008731007575989,
+      "learning_rate": 9.978260869565218e-05,
+      "loss": 1.2352,
+      "mean_token_accuracy": 0.5524563610553741,
+      "num_tokens": 593597.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.5091761827468873,
+      "epoch": 0.5108695652173914,
+      "grad_norm": 0.9790273904800415,
+      "learning_rate": 9.999973836157333e-05,
+      "loss": 1.2448,
+      "mean_token_accuracy": 0.5587224543094635,
+      "num_tokens": 606659.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.5102417111396789,
+      "epoch": 0.5217391304347826,
+      "grad_norm": 0.9725663065910339,
+      "learning_rate": 9.999883393595947e-05,
+      "loss": 1.2366,
+      "mean_token_accuracy": 0.555170550942421,
+      "num_tokens": 619406.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.5230854153633118,
+      "epoch": 0.532608695652174,
+      "grad_norm": 1.0150320529937744,
+      "learning_rate": 9.999728350473721e-05,
+      "loss": 1.2304,
+      "mean_token_accuracy": 0.5601270943880081,
+      "num_tokens": 632194.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.5028501629829407,
+      "epoch": 0.5434782608695652,
+      "grad_norm": 0.8656931519508362,
+      "learning_rate": 9.99950870879387e-05,
+      "loss": 1.2286,
+      "mean_token_accuracy": 0.5571267485618592,
+      "num_tokens": 645327.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.5302343845367432,
+      "epoch": 0.5543478260869565,
+      "grad_norm": 0.7537740468978882,
+      "learning_rate": 9.99922447139426e-05,
+      "loss": 1.2342,
+      "mean_token_accuracy": 0.5612987399101257,
+      "num_tokens": 658378.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.5025633692741394,
+      "epoch": 0.5652173913043478,
+      "grad_norm": 0.6478719115257263,
+      "learning_rate": 9.998875641947354e-05,
+      "loss": 1.2429,
+      "mean_token_accuracy": 0.5501718163490296,
+      "num_tokens": 671323.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.5004254579544067,
+      "epoch": 0.5760869565217391,
+      "grad_norm": 1.2102432250976562,
+      "learning_rate": 9.998462224960175e-05,
+      "loss": 1.213,
+      "mean_token_accuracy": 0.5621294498443603,
+      "num_tokens": 683878.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.5195318818092347,
+      "epoch": 0.5869565217391305,
+      "grad_norm": 0.7961319088935852,
+      "learning_rate": 9.997984225774238e-05,
+      "loss": 1.2492,
+      "mean_token_accuracy": 0.5559745967388153,
+      "num_tokens": 696935.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.5280181407928466,
+      "epoch": 0.5978260869565217,
+      "grad_norm": 0.8740176558494568,
+      "learning_rate": 9.99744165056549e-05,
+      "loss": 1.2197,
+      "mean_token_accuracy": 0.5634395360946656,
+      "num_tokens": 710020.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.5327624678611755,
+      "epoch": 0.6086956521739131,
+      "grad_norm": 0.8462045192718506,
+      "learning_rate": 9.99683450634423e-05,
+      "loss": 1.2192,
+      "mean_token_accuracy": 0.5612434148788452,
+      "num_tokens": 723303.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.5094027161598205,
+      "epoch": 0.6195652173913043,
+      "grad_norm": 0.9465392231941223,
+      "learning_rate": 9.996162800955011e-05,
+      "loss": 1.1817,
+      "mean_token_accuracy": 0.5782815992832184,
+      "num_tokens": 735527.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.5501951813697814,
+      "epoch": 0.6304347826086957,
+      "grad_norm": 0.7445736527442932,
+      "learning_rate": 9.995426543076545e-05,
+      "loss": 1.2452,
+      "mean_token_accuracy": 0.5505437403917313,
+      "num_tokens": 748455.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.5227739214897156,
+      "epoch": 0.6413043478260869,
+      "grad_norm": 0.8378339409828186,
+      "learning_rate": 9.994625742221586e-05,
+      "loss": 1.2551,
+      "mean_token_accuracy": 0.5548771649599076,
+      "num_tokens": 761420.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.5428736090660096,
+      "epoch": 0.6521739130434783,
+      "grad_norm": 0.9249877333641052,
+      "learning_rate": 9.993760408736814e-05,
+      "loss": 1.282,
+      "mean_token_accuracy": 0.5393997848033905,
+      "num_tokens": 773676.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.5630847930908203,
+      "epoch": 0.6630434782608695,
+      "grad_norm": 0.8152625560760498,
+      "learning_rate": 9.992830553802696e-05,
+      "loss": 1.2763,
+      "mean_token_accuracy": 0.5402287989854813,
+      "num_tokens": 786757.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.532595145702362,
+      "epoch": 0.6739130434782609,
+      "grad_norm": 0.7313966751098633,
+      "learning_rate": 9.991836189433342e-05,
+      "loss": 1.2323,
+      "mean_token_accuracy": 0.5645015567541123,
+      "num_tokens": 799851.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.5160420179367065,
+      "epoch": 0.6847826086956522,
+      "grad_norm": 0.7158486843109131,
+      "learning_rate": 9.990777328476348e-05,
+      "loss": 1.2021,
+      "mean_token_accuracy": 0.555733984708786,
+      "num_tokens": 812648.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.5084859609603882,
+      "epoch": 0.6956521739130435,
+      "grad_norm": 0.7056333422660828,
+      "learning_rate": 9.98965398461264e-05,
+      "loss": 1.176,
+      "mean_token_accuracy": 0.5772889316082,
+      "num_tokens": 825054.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.5172175765037537,
+      "epoch": 0.7065217391304348,
+      "grad_norm": 0.8173061013221741,
+      "learning_rate": 9.988466172356282e-05,
+      "loss": 1.1893,
+      "mean_token_accuracy": 0.5774871349334717,
+      "num_tokens": 838148.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.5155822157859802,
+      "epoch": 0.717391304347826,
+      "grad_norm": 0.7483378648757935,
+      "learning_rate": 9.9872139070543e-05,
+      "loss": 1.2377,
+      "mean_token_accuracy": 0.5529649972915649,
+      "num_tokens": 851079.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.5392202258110046,
+      "epoch": 0.7282608695652174,
+      "grad_norm": 0.8020080924034119,
+      "learning_rate": 9.985897204886481e-05,
+      "loss": 1.2471,
+      "mean_token_accuracy": 0.5591055184602738,
+      "num_tokens": 863673.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.520468044281006,
+      "epoch": 0.7391304347826086,
+      "grad_norm": 0.7957432866096497,
+      "learning_rate": 9.984516082865159e-05,
+      "loss": 1.2582,
+      "mean_token_accuracy": 0.5404952645301819,
+      "num_tokens": 876764.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.5024335741996766,
+      "epoch": 0.75,
+      "grad_norm": 0.8745436072349548,
+      "learning_rate": 9.983070558835002e-05,
+      "loss": 1.2029,
+      "mean_token_accuracy": 0.5673643052577972,
+      "num_tokens": 889851.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.5227225780487061,
+      "epoch": 0.7608695652173914,
+      "grad_norm": 0.7866286039352417,
+      "learning_rate": 9.981560651472781e-05,
+      "loss": 1.2597,
+      "mean_token_accuracy": 0.5447615504264831,
+      "num_tokens": 903182.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.5330517888069153,
+      "epoch": 0.7717391304347826,
+      "grad_norm": 0.696030855178833,
+      "learning_rate": 9.97998638028712e-05,
+      "loss": 1.2417,
+      "mean_token_accuracy": 0.5569504171609878,
+      "num_tokens": 916564.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.5023605585098267,
+      "epoch": 0.782608695652174,
+      "grad_norm": 0.7980480194091797,
+      "learning_rate": 9.978347765618257e-05,
+      "loss": 1.2073,
+      "mean_token_accuracy": 0.562690931558609,
+      "num_tokens": 929820.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.5466702461242676,
+      "epoch": 0.7934782608695652,
+      "grad_norm": 0.8441147804260254,
+      "learning_rate": 9.976644828637767e-05,
+      "loss": 1.2859,
+      "mean_token_accuracy": 0.5330282121896743,
+      "num_tokens": 942449.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.515641415119171,
+      "epoch": 0.8043478260869565,
+      "grad_norm": 0.8833957314491272,
+      "learning_rate": 9.974877591348304e-05,
+      "loss": 1.2627,
+      "mean_token_accuracy": 0.5418030679225921,
+      "num_tokens": 955620.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.5292868852615356,
+      "epoch": 0.8152173913043478,
+      "grad_norm": 0.8666150569915771,
+      "learning_rate": 9.973046076583301e-05,
+      "loss": 1.2364,
+      "mean_token_accuracy": 0.5494832009077072,
+      "num_tokens": 968954.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.5135694026947022,
+      "epoch": 0.8260869565217391,
+      "grad_norm": 0.9172241687774658,
+      "learning_rate": 9.97115030800669e-05,
+      "loss": 1.2053,
+      "mean_token_accuracy": 0.5607668071985245,
+      "num_tokens": 981323.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.5214222908020019,
+      "epoch": 0.8369565217391305,
+      "grad_norm": 0.9353718161582947,
+      "learning_rate": 9.969190310112579e-05,
+      "loss": 1.225,
+      "mean_token_accuracy": 0.5599299073219299,
+      "num_tokens": 994834.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.5419185280799865,
+      "epoch": 0.8478260869565217,
+      "grad_norm": 0.717232882976532,
+      "learning_rate": 9.967166108224957e-05,
+      "loss": 1.2848,
+      "mean_token_accuracy": 0.5360999226570129,
+      "num_tokens": 1007806.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.545407807826996,
+      "epoch": 0.8586956521739131,
+      "grad_norm": 0.745928943157196,
+      "learning_rate": 9.965077728497348e-05,
+      "loss": 1.2683,
+      "mean_token_accuracy": 0.5427737534046173,
+      "num_tokens": 1021093.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.5416621446609498,
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.8545331954956055,
+      "learning_rate": 9.96292519791248e-05,
+      "loss": 1.3036,
+      "mean_token_accuracy": 0.5352708637714386,
+      "num_tokens": 1034317.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.516196882724762,
+      "epoch": 0.8804347826086957,
+      "grad_norm": 0.8239868879318237,
+      "learning_rate": 9.96070854428194e-05,
+      "loss": 1.1943,
+      "mean_token_accuracy": 0.568702632188797,
+      "num_tokens": 1047679.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.5478002548217773,
+      "epoch": 0.8913043478260869,
+      "grad_norm": 0.9187906980514526,
+      "learning_rate": 9.958427796245808e-05,
+      "loss": 1.2707,
+      "mean_token_accuracy": 0.5460701882839203,
+      "num_tokens": 1060840.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.540980589389801,
+      "epoch": 0.9021739130434783,
+      "grad_norm": 0.774869978427887,
+      "learning_rate": 9.956082983272293e-05,
+      "loss": 1.2397,
+      "mean_token_accuracy": 0.5464379012584686,
+      "num_tokens": 1073529.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.5131322622299195,
+      "epoch": 0.9130434782608695,
+      "grad_norm": 1.029721975326538,
+      "learning_rate": 9.953674135657345e-05,
+      "loss": 1.2198,
+      "mean_token_accuracy": 0.5641603857278824,
+      "num_tokens": 1086600.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.5259755611419679,
+      "epoch": 0.9239130434782609,
+      "grad_norm": 0.8057295083999634,
+      "learning_rate": 9.951201284524275e-05,
+      "loss": 1.2492,
+      "mean_token_accuracy": 0.5562368750572204,
+      "num_tokens": 1099737.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.5159748077392579,
+      "epoch": 0.9347826086956522,
+      "grad_norm": 0.6001420617103577,
+      "learning_rate": 9.94866446182334e-05,
+      "loss": 1.2524,
+      "mean_token_accuracy": 0.5458084315061569,
+      "num_tokens": 1112239.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.524741494655609,
+      "epoch": 0.9456521739130435,
+      "grad_norm": 0.847523033618927,
+      "learning_rate": 9.94606370033134e-05,
+      "loss": 1.2245,
+      "mean_token_accuracy": 0.5601188719272614,
+      "num_tokens": 1125191.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.5371912598609925,
+      "epoch": 0.9565217391304348,
+      "grad_norm": 0.767745852470398,
+      "learning_rate": 9.943399033651189e-05,
+      "loss": 1.2319,
+      "mean_token_accuracy": 0.5546965420246124,
+      "num_tokens": 1138077.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.5223184943199157,
+      "epoch": 0.967391304347826,
+      "grad_norm": 0.9313151836395264,
+      "learning_rate": 9.94067049621148e-05,
+      "loss": 1.2237,
+      "mean_token_accuracy": 0.5578917026519775,
+      "num_tokens": 1151364.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.530699372291565,
+      "epoch": 0.9782608695652174,
+      "grad_norm": 0.7053420543670654,
+      "learning_rate": 9.937878123266044e-05,
+      "loss": 1.2269,
+      "mean_token_accuracy": 0.5488695651292801,
+      "num_tokens": 1164326.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.5197353124618531,
+      "epoch": 0.9891304347826086,
+      "grad_norm": 0.9986150860786438,
+      "learning_rate": 9.9350219508935e-05,
+      "loss": 1.2106,
+      "mean_token_accuracy": 0.5582584798336029,
+      "num_tokens": 1176914.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.5376808762550354,
+      "epoch": 1.0,
+      "grad_norm": 0.7129160165786743,
+      "learning_rate": 9.93210201599677e-05,
+      "loss": 1.2377,
+      "mean_token_accuracy": 0.557282817363739,
+      "num_tokens": 1189994.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.5522505402565003,
+      "epoch": 1.0108695652173914,
+      "grad_norm": 0.9139987230300903,
+      "learning_rate": 9.929118356302621e-05,
+      "loss": 1.2492,
+      "mean_token_accuracy": 0.5444983661174774,
+      "num_tokens": 1202961.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.5519829273223877,
+      "epoch": 1.0217391304347827,
+      "grad_norm": 1.0422664880752563,
+      "learning_rate": 9.926071010361173e-05,
+      "loss": 1.1957,
+      "mean_token_accuracy": 0.5779279708862305,
+      "num_tokens": 1215901.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.5434082865715026,
+      "epoch": 1.0326086956521738,
+      "grad_norm": 1.0472567081451416,
+      "learning_rate": 9.922960017545395e-05,
+      "loss": 1.2263,
+      "mean_token_accuracy": 0.5640866041183472,
+      "num_tokens": 1228567.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.5352994203567505,
+      "epoch": 1.0434782608695652,
+      "grad_norm": 1.0810585021972656,
+      "learning_rate": 9.919785418050598e-05,
+      "loss": 1.1876,
+      "mean_token_accuracy": 0.5709751307964325,
+      "num_tokens": 1241529.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.4996863842010497,
+      "epoch": 1.0543478260869565,
+      "grad_norm": 1.1204661130905151,
+      "learning_rate": 9.916547252893923e-05,
+      "loss": 1.1354,
+      "mean_token_accuracy": 0.5961336076259613,
+      "num_tokens": 1254137.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.5315279841423035,
+      "epoch": 1.065217391304348,
+      "grad_norm": 1.0741767883300781,
+      "learning_rate": 9.9132455639138e-05,
+      "loss": 1.1422,
+      "mean_token_accuracy": 0.5875493228435517,
+      "num_tokens": 1266871.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.516059410572052,
+      "epoch": 1.0760869565217392,
+      "grad_norm": 1.1965429782867432,
+      "learning_rate": 9.90988039376942e-05,
+      "loss": 1.1438,
+      "mean_token_accuracy": 0.5906685352325439,
+      "num_tokens": 1279655.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.5148675918579102,
+      "epoch": 1.0869565217391304,
+      "grad_norm": 1.1992353200912476,
+      "learning_rate": 9.906451785940167e-05,
+      "loss": 1.1636,
+      "mean_token_accuracy": 0.5710582077503205,
+      "num_tokens": 1292202.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.5187682271003724,
+      "epoch": 1.0978260869565217,
+      "grad_norm": 1.0606764554977417,
+      "learning_rate": 9.902959784725077e-05,
+      "loss": 1.1763,
+      "mean_token_accuracy": 0.5760969400405884,
+      "num_tokens": 1305284.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.5265469312667848,
+      "epoch": 1.108695652173913,
+      "grad_norm": 1.02944815158844,
+      "learning_rate": 9.899404435242246e-05,
+      "loss": 1.2096,
+      "mean_token_accuracy": 0.5624277234077454,
+      "num_tokens": 1318408.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.5455774545669556,
+      "epoch": 1.1195652173913044,
+      "grad_norm": 1.1493759155273438,
+      "learning_rate": 9.895785783428262e-05,
+      "loss": 1.1652,
+      "mean_token_accuracy": 0.5867336988449097,
+      "num_tokens": 1331156.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.5371973156929015,
+      "epoch": 1.1304347826086956,
+      "grad_norm": 0.9468239545822144,
+      "learning_rate": 9.8921038760376e-05,
+      "loss": 1.2371,
+      "mean_token_accuracy": 0.5544474184513092,
+      "num_tokens": 1343904.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.5403631925582886,
+      "epoch": 1.141304347826087,
+      "grad_norm": 1.1717609167099,
+      "learning_rate": 9.888358760642029e-05,
+      "loss": 1.1394,
+      "mean_token_accuracy": 0.5933512449264526,
+      "num_tokens": 1356797.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.5518387794494628,
+      "epoch": 1.1521739130434783,
+      "grad_norm": 1.2024801969528198,
+      "learning_rate": 9.884550485629987e-05,
+      "loss": 1.2065,
+      "mean_token_accuracy": 0.5667118012905121,
+      "num_tokens": 1369690.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.5736596703529357,
+      "epoch": 1.1630434782608696,
+      "grad_norm": 1.0323596000671387,
+      "learning_rate": 9.88067910020596e-05,
+      "loss": 1.2124,
+      "mean_token_accuracy": 0.5691272497177124,
+      "num_tokens": 1382561.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.57814359664917,
+      "epoch": 1.1739130434782608,
+      "grad_norm": 1.1128944158554077,
+      "learning_rate": 9.876744654389854e-05,
+      "loss": 1.2319,
+      "mean_token_accuracy": 0.554848113656044,
+      "num_tokens": 1395409.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.5651036262512208,
+      "epoch": 1.184782608695652,
+      "grad_norm": 1.1131497621536255,
+      "learning_rate": 9.872747199016328e-05,
+      "loss": 1.1995,
+      "mean_token_accuracy": 0.5680587291717529,
+      "num_tokens": 1408511.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.519801914691925,
+      "epoch": 1.1956521739130435,
+      "grad_norm": 0.8381641507148743,
+      "learning_rate": 9.868686785734165e-05,
+      "loss": 1.1729,
+      "mean_token_accuracy": 0.5780038118362427,
+      "num_tokens": 1421328.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.5411308765411378,
+      "epoch": 1.2065217391304348,
+      "grad_norm": 1.1784008741378784,
+      "learning_rate": 9.86456346700558e-05,
+      "loss": 1.2026,
+      "mean_token_accuracy": 0.5581619143486023,
+      "num_tokens": 1434644.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.524932038784027,
+      "epoch": 1.2173913043478262,
+      "grad_norm": 0.9289618730545044,
+      "learning_rate": 9.860377296105556e-05,
+      "loss": 1.219,
+      "mean_token_accuracy": 0.557993471622467,
+      "num_tokens": 1447469.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.5029574513435364,
+      "epoch": 1.2282608695652173,
+      "grad_norm": 1.0168135166168213,
+      "learning_rate": 9.856128327121155e-05,
+      "loss": 1.1589,
+      "mean_token_accuracy": 0.578672569990158,
+      "num_tokens": 1460202.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.5095925211906434,
+      "epoch": 1.2391304347826086,
+      "grad_norm": 1.052454948425293,
+      "learning_rate": 9.85181661495081e-05,
+      "loss": 1.2232,
+      "mean_token_accuracy": 0.5522898703813552,
+      "num_tokens": 1473114.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.5059074401855468,
+      "epoch": 1.25,
+      "grad_norm": 1.20883309841156,
+      "learning_rate": 9.847442215303626e-05,
+      "loss": 1.2172,
+      "mean_token_accuracy": 0.5659465253353119,
+      "num_tokens": 1485990.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.494919514656067,
+      "epoch": 1.2608695652173914,
+      "grad_norm": 1.1653634309768677,
+      "learning_rate": 9.843005184698655e-05,
+      "loss": 1.1817,
+      "mean_token_accuracy": 0.5764101088047028,
+      "num_tokens": 1498939.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.5184181690216065,
+      "epoch": 1.2717391304347827,
+      "grad_norm": 1.1174242496490479,
+      "learning_rate": 9.838505580464168e-05,
+      "loss": 1.1976,
+      "mean_token_accuracy": 0.5707351744174958,
+      "num_tokens": 1511943.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.5217233657836915,
+      "epoch": 1.2826086956521738,
+      "grad_norm": 1.0029795169830322,
+      "learning_rate": 9.833943460736912e-05,
+      "loss": 1.2296,
+      "mean_token_accuracy": 0.5572409898042678,
+      "num_tokens": 1525135.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.514461922645569,
+      "epoch": 1.2934782608695652,
+      "grad_norm": 1.2473056316375732,
+      "learning_rate": 9.829318884461359e-05,
+      "loss": 1.221,
+      "mean_token_accuracy": 0.5566778779029846,
+      "num_tokens": 1537699.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.5298507332801818,
+      "epoch": 1.3043478260869565,
+      "grad_norm": 1.068049430847168,
+      "learning_rate": 9.824631911388948e-05,
+      "loss": 1.248,
+      "mean_token_accuracy": 0.5430671572685242,
+      "num_tokens": 1550938.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.5463980197906495,
+      "epoch": 1.315217391304348,
+      "grad_norm": 1.0760388374328613,
+      "learning_rate": 9.819882602077309e-05,
+      "loss": 1.2825,
+      "mean_token_accuracy": 0.5330462247133255,
+      "num_tokens": 1563597.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.5457575082778932,
+      "epoch": 1.3260869565217392,
+      "grad_norm": 1.1161272525787354,
+      "learning_rate": 9.815071017889482e-05,
+      "loss": 1.2598,
+      "mean_token_accuracy": 0.543943053483963,
+      "num_tokens": 1576301.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.5349620819091796,
+      "epoch": 1.3369565217391304,
+      "grad_norm": 1.1779778003692627,
+      "learning_rate": 9.810197220993123e-05,
+      "loss": 1.2551,
+      "mean_token_accuracy": 0.5386941403150558,
+      "num_tokens": 1589776.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.5158817052841187,
+      "epoch": 1.3478260869565217,
+      "grad_norm": 1.1150175333023071,
+      "learning_rate": 9.805261274359705e-05,
+      "loss": 1.193,
+      "mean_token_accuracy": 0.5642519950866699,
+      "num_tokens": 1602239.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.512274718284607,
+      "epoch": 1.358695652173913,
+      "grad_norm": 0.9392043948173523,
+      "learning_rate": 9.800263241763698e-05,
+      "loss": 1.2334,
+      "mean_token_accuracy": 0.5577278465032578,
+      "num_tokens": 1615621.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.5087523460388184,
+      "epoch": 1.3695652173913042,
+      "grad_norm": 0.9521236419677734,
+      "learning_rate": 9.795203187781751e-05,
+      "loss": 1.1651,
+      "mean_token_accuracy": 0.5836262464523315,
+      "num_tokens": 1628741.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.5212602257728576,
+      "epoch": 1.3804347826086958,
+      "grad_norm": 0.9689566493034363,
+      "learning_rate": 9.790081177791852e-05,
+      "loss": 1.1944,
+      "mean_token_accuracy": 0.572248637676239,
+      "num_tokens": 1641646.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.521955931186676,
+      "epoch": 1.391304347826087,
+      "grad_norm": 1.016711711883545,
+      "learning_rate": 9.784897277972491e-05,
+      "loss": 1.2105,
+      "mean_token_accuracy": 0.5605559885501862,
+      "num_tokens": 1654499.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.5126453638076782,
+      "epoch": 1.4021739130434783,
+      "grad_norm": 1.1951313018798828,
+      "learning_rate": 9.779651555301794e-05,
+      "loss": 1.2305,
+      "mean_token_accuracy": 0.5537042915821075,
+      "num_tokens": 1667748.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.528828752040863,
+      "epoch": 1.4130434782608696,
+      "grad_norm": 1.1385231018066406,
+      "learning_rate": 9.77434407755667e-05,
+      "loss": 1.2294,
+      "mean_token_accuracy": 0.554050150513649,
+      "num_tokens": 1681184.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.510583758354187,
+      "epoch": 1.4239130434782608,
+      "grad_norm": 1.0576328039169312,
+      "learning_rate": 9.768974913311922e-05,
+      "loss": 1.2674,
+      "mean_token_accuracy": 0.5414516568183899,
+      "num_tokens": 1693818.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.5150775551795959,
+      "epoch": 1.434782608695652,
+      "grad_norm": 1.3364728689193726,
+      "learning_rate": 9.763544131939374e-05,
+      "loss": 1.2075,
+      "mean_token_accuracy": 0.559964632987976,
+      "num_tokens": 1706939.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.5151704668998718,
+      "epoch": 1.4456521739130435,
+      "grad_norm": 1.02871835231781,
+      "learning_rate": 9.758051803606971e-05,
+      "loss": 1.2487,
+      "mean_token_accuracy": 0.552227908372879,
+      "num_tokens": 1719315.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.5152636528015138,
+      "epoch": 1.4565217391304348,
+      "grad_norm": 1.0097824335098267,
+      "learning_rate": 9.75249799927786e-05,
+      "loss": 1.2263,
+      "mean_token_accuracy": 0.5533849179744721,
+      "num_tokens": 1731891.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.512537384033203,
+      "epoch": 1.4673913043478262,
+      "grad_norm": 1.2632033824920654,
+      "learning_rate": 9.746882790709491e-05,
+      "loss": 1.222,
+      "mean_token_accuracy": 0.5614925265312195,
+      "num_tokens": 1744427.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.5295302748680115,
+      "epoch": 1.4782608695652173,
+      "grad_norm": 1.113368034362793,
+      "learning_rate": 9.741206250452683e-05,
+      "loss": 1.2735,
+      "mean_token_accuracy": 0.539223712682724,
+      "num_tokens": 1757083.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.536200964450836,
+      "epoch": 1.4891304347826086,
+      "grad_norm": 1.1522810459136963,
+      "learning_rate": 9.735468451850681e-05,
+      "loss": 1.2152,
+      "mean_token_accuracy": 0.565186282992363,
+      "num_tokens": 1769982.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.495800745487213,
+      "epoch": 1.5,
+      "grad_norm": 1.2632598876953125,
+      "learning_rate": 9.729669469038216e-05,
+      "loss": 1.1635,
+      "mean_token_accuracy": 0.5871178984642029,
+      "num_tokens": 1783102.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.535517191886902,
+      "epoch": 1.5108695652173914,
+      "grad_norm": 0.9593290090560913,
+      "learning_rate": 9.723809376940544e-05,
+      "loss": 1.2108,
+      "mean_token_accuracy": 0.5709479689598084,
+      "num_tokens": 1796398.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.529611337184906,
+      "epoch": 1.5217391304347827,
+      "grad_norm": 1.0819748640060425,
+      "learning_rate": 9.717888251272477e-05,
+      "loss": 1.1972,
+      "mean_token_accuracy": 0.5633429378271103,
+      "num_tokens": 1809379.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.5493282318115233,
+      "epoch": 1.5326086956521738,
+      "grad_norm": 0.9472999572753906,
+      "learning_rate": 9.71190616853741e-05,
+      "loss": 1.2616,
+      "mean_token_accuracy": 0.5486618399620056,
+      "num_tokens": 1822664.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.4989375710487365,
+      "epoch": 1.5434782608695652,
+      "grad_norm": 1.2883214950561523,
+      "learning_rate": 9.705863206026321e-05,
+      "loss": 1.2137,
+      "mean_token_accuracy": 0.558601850271225,
+      "num_tokens": 1835336.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.5061516761779785,
+      "epoch": 1.5543478260869565,
+      "grad_norm": 0.9577755928039551,
+      "learning_rate": 9.699759441816787e-05,
+      "loss": 1.1739,
+      "mean_token_accuracy": 0.577557110786438,
+      "num_tokens": 1847755.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.5141437649726868,
+      "epoch": 1.5652173913043477,
+      "grad_norm": 1.0751005411148071,
+      "learning_rate": 9.693594954771965e-05,
+      "loss": 1.231,
+      "mean_token_accuracy": 0.5506497710943222,
+      "num_tokens": 1860302.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.5419356107711792,
+      "epoch": 1.5760869565217392,
+      "grad_norm": 1.0141667127609253,
+      "learning_rate": 9.687369824539577e-05,
+      "loss": 1.2788,
+      "mean_token_accuracy": 0.5303231775760651,
+      "num_tokens": 1873093.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.520876133441925,
+      "epoch": 1.5869565217391304,
+      "grad_norm": 1.109215259552002,
+      "learning_rate": 9.68108413155088e-05,
+      "loss": 1.2333,
+      "mean_token_accuracy": 0.5601014912128448,
+      "num_tokens": 1886177.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.4981224894523621,
+      "epoch": 1.5978260869565217,
+      "grad_norm": 0.9200493097305298,
+      "learning_rate": 9.674737957019624e-05,
+      "loss": 1.1852,
+      "mean_token_accuracy": 0.5700576066970825,
+      "num_tokens": 1899113.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.5140800833702088,
+      "epoch": 1.608695652173913,
+      "grad_norm": 1.190007209777832,
+      "learning_rate": 9.66833138294101e-05,
+      "loss": 1.1929,
+      "mean_token_accuracy": 0.5691904962062836,
+      "num_tokens": 1912474.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.5299779295921325,
+      "epoch": 1.6195652173913042,
+      "grad_norm": 0.9787003397941589,
+      "learning_rate": 9.661864492090625e-05,
+      "loss": 1.2179,
+      "mean_token_accuracy": 0.553766930103302,
+      "num_tokens": 1925685.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.5431510925292968,
+      "epoch": 1.6304347826086958,
+      "grad_norm": 1.1734333038330078,
+      "learning_rate": 9.655337368023371e-05,
+      "loss": 1.2108,
+      "mean_token_accuracy": 0.5539384454488754,
+      "num_tokens": 1938610.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.5246233105659486,
+      "epoch": 1.641304347826087,
+      "grad_norm": 1.072691559791565,
+      "learning_rate": 9.64875009507239e-05,
+      "loss": 1.1999,
+      "mean_token_accuracy": 0.5761029601097107,
+      "num_tokens": 1951241.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.538881742954254,
+      "epoch": 1.6521739130434783,
+      "grad_norm": 1.0783456563949585,
+      "learning_rate": 9.642102758347973e-05,
+      "loss": 1.2443,
+      "mean_token_accuracy": 0.5502734839916229,
+      "num_tokens": 1964816.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.550068199634552,
+      "epoch": 1.6630434782608696,
+      "grad_norm": 1.0582056045532227,
+      "learning_rate": 9.63539544373646e-05,
+      "loss": 1.2182,
+      "mean_token_accuracy": 0.5598388969898224,
+      "num_tokens": 1977930.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.5344447016716003,
+      "epoch": 1.6739130434782608,
+      "grad_norm": 0.9788505434989929,
+      "learning_rate": 9.628628237899126e-05,
+      "loss": 1.1852,
+      "mean_token_accuracy": 0.5595145970582962,
+      "num_tokens": 1991032.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.5468374967575074,
+      "epoch": 1.6847826086956523,
+      "grad_norm": 1.0464048385620117,
+      "learning_rate": 9.621801228271073e-05,
+      "loss": 1.2175,
+      "mean_token_accuracy": 0.5616866886615753,
+      "num_tokens": 2004207.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.5422045588493347,
+      "epoch": 1.6956521739130435,
+      "grad_norm": 0.8307158946990967,
+      "learning_rate": 9.614914503060083e-05,
+      "loss": 1.2202,
+      "mean_token_accuracy": 0.5515525698661804,
+      "num_tokens": 2016969.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.5343055129051208,
+      "epoch": 1.7065217391304348,
+      "grad_norm": 1.198614239692688,
+      "learning_rate": 9.607968151245498e-05,
+      "loss": 1.1866,
+      "mean_token_accuracy": 0.5771215856075287,
+      "num_tokens": 2029750.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.5321205615997315,
+      "epoch": 1.7173913043478262,
+      "grad_norm": 0.9247676134109497,
+      "learning_rate": 9.600962262577053e-05,
+      "loss": 1.2205,
+      "mean_token_accuracy": 0.5626431256532669,
+      "num_tokens": 2043181.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.541359269618988,
+      "epoch": 1.7282608695652173,
+      "grad_norm": 1.0934436321258545,
+      "learning_rate": 9.593896927573728e-05,
+      "loss": 1.2397,
+      "mean_token_accuracy": 0.5406488478183746,
+      "num_tokens": 2056541.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.5094799280166626,
+      "epoch": 1.7391304347826086,
+      "grad_norm": 0.8803229928016663,
+      "learning_rate": 9.586772237522573e-05,
+      "loss": 1.2047,
+      "mean_token_accuracy": 0.5659328937530518,
+      "num_tokens": 2069752.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.4946965217590331,
+      "epoch": 1.75,
+      "grad_norm": 1.0182883739471436,
+      "learning_rate": 9.579588284477526e-05,
+      "loss": 1.1492,
+      "mean_token_accuracy": 0.5829819083213806,
+      "num_tokens": 2083119.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.483540380001068,
+      "epoch": 1.7608695652173914,
+      "grad_norm": 1.2263387441635132,
+      "learning_rate": 9.572345161258235e-05,
+      "loss": 1.1474,
+      "mean_token_accuracy": 0.5895151972770691,
+      "num_tokens": 2095862.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.5072382926940917,
+      "epoch": 1.7717391304347827,
+      "grad_norm": 0.8639858365058899,
+      "learning_rate": 9.565042961448844e-05,
+      "loss": 1.1997,
+      "mean_token_accuracy": 0.5625985980033874,
+      "num_tokens": 2108549.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.5286765098571777,
+      "epoch": 1.7826086956521738,
+      "grad_norm": 1.0652116537094116,
+      "learning_rate": 9.557681779396797e-05,
+      "loss": 1.2253,
+      "mean_token_accuracy": 0.5569576025009155,
+      "num_tokens": 2120871.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.5006824493408204,
+      "epoch": 1.7934782608695652,
+      "grad_norm": 0.949942946434021,
+      "learning_rate": 9.550261710211608e-05,
+      "loss": 1.1973,
+      "mean_token_accuracy": 0.5634852379560471,
+      "num_tokens": 2134097.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.5050195574760437,
+      "epoch": 1.8043478260869565,
+      "grad_norm": 1.016350507736206,
+      "learning_rate": 9.542782849763637e-05,
+      "loss": 1.1709,
+      "mean_token_accuracy": 0.5780033886432647,
+      "num_tokens": 2147811.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.5177413702011109,
+      "epoch": 1.8152173913043477,
+      "grad_norm": 1.264799952507019,
+      "learning_rate": 9.535245294682857e-05,
+      "loss": 1.2513,
+      "mean_token_accuracy": 0.5521585702896118,
+      "num_tokens": 2160506.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.5374733328819274,
+      "epoch": 1.8260869565217392,
+      "grad_norm": 1.1713751554489136,
+      "learning_rate": 9.527649142357596e-05,
+      "loss": 1.2708,
+      "mean_token_accuracy": 0.5314113944768906,
+      "num_tokens": 2173328.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.5083181142807007,
+      "epoch": 1.8369565217391304,
+      "grad_norm": 1.1553071737289429,
+      "learning_rate": 9.519994490933279e-05,
+      "loss": 1.206,
+      "mean_token_accuracy": 0.5680734992027283,
+      "num_tokens": 2186452.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.5291340470314025,
+      "epoch": 1.8478260869565217,
+      "grad_norm": 1.1443442106246948,
+      "learning_rate": 9.51228143931117e-05,
+      "loss": 1.2351,
+      "mean_token_accuracy": 0.5539528131484985,
+      "num_tokens": 2199594.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.5204999327659607,
+      "epoch": 1.858695652173913,
+      "grad_norm": 1.1584019660949707,
+      "learning_rate": 9.504510087147088e-05,
+      "loss": 1.2338,
+      "mean_token_accuracy": 0.5519226849079132,
+      "num_tokens": 2212135.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.5614403247833253,
+      "epoch": 1.8695652173913042,
+      "grad_norm": 1.0798224210739136,
+      "learning_rate": 9.496680534850113e-05,
+      "loss": 1.2534,
+      "mean_token_accuracy": 0.5530328571796417,
+      "num_tokens": 2225159.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.5276212096214294,
+      "epoch": 1.8804347826086958,
+      "grad_norm": 1.1296766996383667,
+      "learning_rate": 9.488792883581299e-05,
+      "loss": 1.1784,
+      "mean_token_accuracy": 0.5774711936712265,
+      "num_tokens": 2238139.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.544056522846222,
+      "epoch": 1.891304347826087,
+      "grad_norm": 1.1214172840118408,
+      "learning_rate": 9.480847235252361e-05,
+      "loss": 1.2268,
+      "mean_token_accuracy": 0.5613886952400208,
+      "num_tokens": 2250928.0,
+      "step": 1740
+    },
+    {
+      "entropy": 1.5295695900917052,
+      "epoch": 1.9021739130434783,
+      "grad_norm": 1.1650352478027344,
+      "learning_rate": 9.472843692524363e-05,
+      "loss": 1.1573,
+      "mean_token_accuracy": 0.5787465155124665,
+      "num_tokens": 2263338.0,
+      "step": 1750
+    },
+    {
+      "entropy": 1.5347764611244201,
+      "epoch": 1.9130434782608696,
+      "grad_norm": 1.0249896049499512,
+      "learning_rate": 9.464782358806383e-05,
+      "loss": 1.1731,
+      "mean_token_accuracy": 0.5780636668205261,
+      "num_tokens": 2276200.0,
+      "step": 1760
+    },
+    {
+      "entropy": 1.5715635061264037,
+      "epoch": 1.9239130434782608,
+      "grad_norm": 1.0768051147460938,
+      "learning_rate": 9.45666333825419e-05,
+      "loss": 1.2585,
+      "mean_token_accuracy": 0.5452336609363556,
+      "num_tokens": 2289088.0,
+      "step": 1770
+    },
+    {
+      "entropy": 1.5402274131774902,
+      "epoch": 1.9347826086956523,
+      "grad_norm": 1.0846654176712036,
+      "learning_rate": 9.448486735768884e-05,
+      "loss": 1.1918,
+      "mean_token_accuracy": 0.5699589729309082,
+      "num_tokens": 2302544.0,
+      "step": 1780
+    },
+    {
+      "entropy": 1.5048401594161986,
+      "epoch": 1.9456521739130435,
+      "grad_norm": 1.1533433198928833,
+      "learning_rate": 9.440252656995551e-05,
+      "loss": 1.1792,
+      "mean_token_accuracy": 0.5685461640357972,
+      "num_tokens": 2315473.0,
+      "step": 1790
+    },
+    {
+      "entropy": 1.5128441214561463,
+      "epoch": 1.9565217391304348,
+      "grad_norm": 1.2847894430160522,
+      "learning_rate": 9.431961208321892e-05,
+      "loss": 1.1566,
+      "mean_token_accuracy": 0.5870453357696533,
+      "num_tokens": 2329176.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.5362990856170655,
+      "epoch": 1.9673913043478262,
+      "grad_norm": 1.2497868537902832,
+      "learning_rate": 9.423612496876855e-05,
+      "loss": 1.1896,
+      "mean_token_accuracy": 0.5719706892967225,
+      "num_tokens": 2341591.0,
+      "step": 1810
+    },
+    {
+      "entropy": 1.5580734014511108,
+      "epoch": 1.9782608695652173,
+      "grad_norm": 1.1140056848526,
+      "learning_rate": 9.415206630529241e-05,
+      "loss": 1.2434,
+      "mean_token_accuracy": 0.5461874425411224,
+      "num_tokens": 2354577.0,
+      "step": 1820
+    },
+    {
+      "entropy": 1.5499179720878602,
+      "epoch": 1.9891304347826086,
+      "grad_norm": 1.0708650350570679,
+      "learning_rate": 9.406743717886321e-05,
+      "loss": 1.1635,
+      "mean_token_accuracy": 0.5835445284843445,
+      "num_tokens": 2366934.0,
+      "step": 1830
+    },
+    {
+      "entropy": 1.5282660722732544,
+      "epoch": 2.0,
+      "grad_norm": 0.9982873797416687,
+      "learning_rate": 9.398223868292424e-05,
+      "loss": 1.162,
+      "mean_token_accuracy": 0.5795026063919068,
+      "num_tokens": 2379988.0,
+      "step": 1840
+    },
+    {
+      "entropy": 1.5200519680976867,
+      "epoch": 2.010869565217391,
+      "grad_norm": 1.7385492324829102,
+      "learning_rate": 9.389647191827533e-05,
+      "loss": 1.1189,
+      "mean_token_accuracy": 0.593557745218277,
+      "num_tokens": 2393159.0,
+      "step": 1850
+    },
+    {
+      "entropy": 1.4853432416915893,
+      "epoch": 2.0217391304347827,
+      "grad_norm": 1.50752592086792,
+      "learning_rate": 9.38101379930585e-05,
+      "loss": 1.1213,
+      "mean_token_accuracy": 0.5929610729217529,
+      "num_tokens": 2405857.0,
+      "step": 1860
+    },
+    {
+      "entropy": 1.4830349206924438,
+      "epoch": 2.032608695652174,
+      "grad_norm": 1.863458275794983,
+      "learning_rate": 9.372323802274379e-05,
+      "loss": 1.054,
+      "mean_token_accuracy": 0.6212123036384583,
+      "num_tokens": 2418899.0,
+      "step": 1870
+    },
+    {
+      "entropy": 1.4546002388000487,
+      "epoch": 2.0434782608695654,
+      "grad_norm": 1.7944133281707764,
+      "learning_rate": 9.363577313011473e-05,
+      "loss": 1.1034,
+      "mean_token_accuracy": 0.6010935366153717,
+      "num_tokens": 2431992.0,
+      "step": 1880
+    },
+    {
+      "entropy": 1.4757711172103882,
+      "epoch": 2.0543478260869565,
+      "grad_norm": 2.174811363220215,
+      "learning_rate": 9.354774444525391e-05,
+      "loss": 1.1084,
+      "mean_token_accuracy": 0.5978758096694946,
+      "num_tokens": 2445070.0,
+      "step": 1890
+    },
+    {
+      "entropy": 1.4131018400192261,
+      "epoch": 2.0652173913043477,
+      "grad_norm": 1.7135498523712158,
+      "learning_rate": 9.345915310552835e-05,
+      "loss": 0.989,
+      "mean_token_accuracy": 0.6438018441200256,
+      "num_tokens": 2458209.0,
+      "step": 1900
+    },
+    {
+      "entropy": 1.433064377307892,
+      "epoch": 2.0760869565217392,
+      "grad_norm": 2.0136771202087402,
+      "learning_rate": 9.337000025557476e-05,
+      "loss": 1.0498,
+      "mean_token_accuracy": 0.6281741559505463,
+      "num_tokens": 2471524.0,
+      "step": 1910
+    },
+    {
+      "entropy": 1.4479591965675354,
+      "epoch": 2.0869565217391304,
+      "grad_norm": 1.974313735961914,
+      "learning_rate": 9.328028704728486e-05,
+      "loss": 1.1358,
+      "mean_token_accuracy": 0.5909003794193268,
+      "num_tokens": 2484390.0,
+      "step": 1920
+    },
+    {
+      "entropy": 1.4301373600959777,
+      "epoch": 2.097826086956522,
+      "grad_norm": 2.091122627258301,
+      "learning_rate": 9.319001463979036e-05,
+      "loss": 1.0644,
+      "mean_token_accuracy": 0.6180503129959106,
+      "num_tokens": 2497381.0,
+      "step": 1930
+    },
+    {
+      "entropy": 1.447307538986206,
+      "epoch": 2.108695652173913,
+      "grad_norm": 2.072110414505005,
+      "learning_rate": 9.309918419944812e-05,
+      "loss": 1.0516,
+      "mean_token_accuracy": 0.6275238871574402,
+      "num_tokens": 2510088.0,
+      "step": 1940
+    },
+    {
+      "entropy": 1.4422586917877198,
+      "epoch": 2.119565217391304,
+      "grad_norm": 2.243779182434082,
+      "learning_rate": 9.300779689982498e-05,
+      "loss": 1.0335,
+      "mean_token_accuracy": 0.6258177101612091,
+      "num_tokens": 2522503.0,
+      "step": 1950
+    },
+    {
+      "entropy": 1.4682541370391846,
+      "epoch": 2.130434782608696,
+      "grad_norm": 1.9335227012634277,
+      "learning_rate": 9.291585392168262e-05,
+      "loss": 1.1021,
+      "mean_token_accuracy": 0.5970285713672638,
+      "num_tokens": 2534900.0,
+      "step": 1960
+    },
+    {
+      "entropy": 1.4592154502868653,
+      "epoch": 2.141304347826087,
+      "grad_norm": 1.8887137174606323,
+      "learning_rate": 9.282335645296236e-05,
+      "loss": 1.1029,
+      "mean_token_accuracy": 0.6081736207008361,
+      "num_tokens": 2547520.0,
+      "step": 1970
+    },
+    {
+      "entropy": 1.4663932919502258,
+      "epoch": 2.1521739130434785,
+      "grad_norm": 1.85846745967865,
+      "learning_rate": 9.273030568876972e-05,
+      "loss": 1.1397,
+      "mean_token_accuracy": 0.5843419551849365,
+      "num_tokens": 2561091.0,
+      "step": 1980
+    },
+    {
+      "entropy": 1.4661445379257203,
+      "epoch": 2.1630434782608696,
+      "grad_norm": 1.8708477020263672,
+      "learning_rate": 9.263670283135908e-05,
+      "loss": 1.0669,
+      "mean_token_accuracy": 0.615378075838089,
+      "num_tokens": 2574368.0,
+      "step": 1990
+    },
+    {
+      "entropy": 1.452696180343628,
+      "epoch": 2.1739130434782608,
+      "grad_norm": 2.0021235942840576,
+      "learning_rate": 9.254254909011804e-05,
+      "loss": 1.1013,
+      "mean_token_accuracy": 0.6071189284324646,
+      "num_tokens": 2587038.0,
+      "step": 2000
+    },
+    {
+      "entropy": 1.476554262638092,
+      "epoch": 2.1847826086956523,
+      "grad_norm": 1.845048427581787,
+      "learning_rate": 9.244784568155186e-05,
+      "loss": 1.1526,
+      "mean_token_accuracy": 0.5891152262687683,
+      "num_tokens": 2599681.0,
+      "step": 2010
+    },
+    {
+      "entropy": 1.456975257396698,
+      "epoch": 2.1956521739130435,
+      "grad_norm": 1.8318718671798706,
+      "learning_rate": 9.235259382926775e-05,
+      "loss": 1.0836,
+      "mean_token_accuracy": 0.6162681877613068,
+      "num_tokens": 2612413.0,
+      "step": 2020
+    },
+    {
+      "entropy": 1.4466612815856934,
+      "epoch": 2.2065217391304346,
+      "grad_norm": 1.732918381690979,
+      "learning_rate": 9.225679476395904e-05,
+      "loss": 1.0735,
+      "mean_token_accuracy": 0.6188689291477203,
+      "num_tokens": 2624887.0,
+      "step": 2030
+    },
+    {
+      "entropy": 1.4582988500595093,
+      "epoch": 2.217391304347826,
+      "grad_norm": 1.8891679048538208,
+      "learning_rate": 9.216044972338924e-05,
+      "loss": 1.1246,
+      "mean_token_accuracy": 0.5922350704669952,
+      "num_tokens": 2637800.0,
+      "step": 2040
+    },
+    {
+      "entropy": 1.423206055164337,
+      "epoch": 2.2282608695652173,
+      "grad_norm": 2.4237935543060303,
+      "learning_rate": 9.206355995237614e-05,
+      "loss": 0.997,
+      "mean_token_accuracy": 0.6368430316448211,
+      "num_tokens": 2650494.0,
+      "step": 2050
+    },
+    {
+      "entropy": 1.4386169552803039,
+      "epoch": 2.239130434782609,
+      "grad_norm": 1.8828299045562744,
+      "learning_rate": 9.196612670277561e-05,
+      "loss": 1.1133,
+      "mean_token_accuracy": 0.6033352434635162,
+      "num_tokens": 2663664.0,
+      "step": 2060
+    },
+    {
+      "entropy": 1.47161967754364,
+      "epoch": 2.25,
+      "grad_norm": 2.1654105186462402,
+      "learning_rate": 9.186815123346555e-05,
+      "loss": 1.1308,
+      "mean_token_accuracy": 0.5900616765022277,
+      "num_tokens": 2676431.0,
+      "step": 2070
+    },
+    {
+      "entropy": 1.4096957564353942,
+      "epoch": 2.260869565217391,
+      "grad_norm": 2.163583278656006,
+      "learning_rate": 9.176963481032951e-05,
+      "loss": 1.0434,
+      "mean_token_accuracy": 0.624973613023758,
+      "num_tokens": 2688969.0,
+      "step": 2080
+    },
+    {
+      "entropy": 1.4344560146331786,
+      "epoch": 2.2717391304347827,
+      "grad_norm": 2.001441478729248,
+      "learning_rate": 9.167057870624045e-05,
+      "loss": 1.0799,
+      "mean_token_accuracy": 0.6116856634616852,
+      "num_tokens": 2701940.0,
+      "step": 2090
+    },
+    {
+      "entropy": 1.4405582904815675,
+      "epoch": 2.282608695652174,
+      "grad_norm": 2.0407516956329346,
+      "learning_rate": 9.157098420104416e-05,
+      "loss": 1.1088,
+      "mean_token_accuracy": 0.5969192087650299,
+      "num_tokens": 2714710.0,
+      "step": 2100
+    },
+    {
+      "entropy": 1.4413450241088868,
+      "epoch": 2.2934782608695654,
+      "grad_norm": 2.175715684890747,
+      "learning_rate": 9.147085258154284e-05,
+      "loss": 1.1133,
+      "mean_token_accuracy": 0.597865492105484,
+      "num_tokens": 2728122.0,
+      "step": 2110
+    },
+    {
+      "entropy": 1.4492259979248048,
+      "epoch": 2.3043478260869565,
+      "grad_norm": 1.756759762763977,
+      "learning_rate": 9.137018514147842e-05,
+      "loss": 1.139,
+      "mean_token_accuracy": 0.5876732349395752,
+      "num_tokens": 2741386.0,
+      "step": 2120
+    },
+    {
+      "entropy": 1.4231749057769776,
+      "epoch": 2.3152173913043477,
+      "grad_norm": 2.2174477577209473,
+      "learning_rate": 9.126898318151585e-05,
+      "loss": 1.0647,
+      "mean_token_accuracy": 0.6096932172775269,
+      "num_tokens": 2754105.0,
+      "step": 2130
+    },
+    {
+      "entropy": 1.4355740666389465,
+      "epoch": 2.3260869565217392,
+      "grad_norm": 1.9028793573379517,
+      "learning_rate": 9.116724800922629e-05,
+      "loss": 1.1054,
+      "mean_token_accuracy": 0.6013290226459503,
+      "num_tokens": 2767282.0,
+      "step": 2140
+    },
+    {
+      "entropy": 1.4280303597450257,
+      "epoch": 2.3369565217391304,
+      "grad_norm": 2.2789151668548584,
+      "learning_rate": 9.106498093907024e-05,
+      "loss": 1.0761,
+      "mean_token_accuracy": 0.6177151262760162,
+      "num_tokens": 2779590.0,
+      "step": 2150
+    },
+    {
+      "entropy": 1.4350167870521546,
+      "epoch": 2.3478260869565215,
+      "grad_norm": 1.9398648738861084,
+      "learning_rate": 9.096218329238053e-05,
+      "loss": 1.1292,
+      "mean_token_accuracy": 0.5900428295135498,
+      "num_tokens": 2792004.0,
+      "step": 2160
+    },
+    {
+      "entropy": 1.4391902089118958,
+      "epoch": 2.358695652173913,
+      "grad_norm": 2.298678159713745,
+      "learning_rate": 9.085885639734527e-05,
+      "loss": 1.0603,
+      "mean_token_accuracy": 0.6203874349594116,
+      "num_tokens": 2804852.0,
+      "step": 2170
+    },
+    {
+      "entropy": 1.4266687989234925,
+      "epoch": 2.369565217391304,
+      "grad_norm": 1.8723516464233398,
+      "learning_rate": 9.075500158899067e-05,
+      "loss": 1.0439,
+      "mean_token_accuracy": 0.6204161286354065,
+      "num_tokens": 2818240.0,
+      "step": 2180
+    },
+    {
+      "entropy": 1.447180449962616,
+      "epoch": 2.380434782608696,
+      "grad_norm": 2.6517324447631836,
+      "learning_rate": 9.065062020916377e-05,
+      "loss": 1.0897,
+      "mean_token_accuracy": 0.6031298160552978,
+      "num_tokens": 2831531.0,
+      "step": 2190
+    },
+    {
+      "entropy": 1.45684095621109,
+      "epoch": 2.391304347826087,
+      "grad_norm": 2.0795247554779053,
+      "learning_rate": 9.054571360651517e-05,
+      "loss": 1.0772,
+      "mean_token_accuracy": 0.6159474074840545,
+      "num_tokens": 2844349.0,
+      "step": 2200
+    },
+    {
+      "entropy": 1.4636580228805542,
+      "epoch": 2.4021739130434785,
+      "grad_norm": 2.3823142051696777,
+      "learning_rate": 9.044028313648157e-05,
+      "loss": 1.0985,
+      "mean_token_accuracy": 0.600275206565857,
+      "num_tokens": 2857149.0,
+      "step": 2210
+    },
+    {
+      "entropy": 1.452285599708557,
+      "epoch": 2.4130434782608696,
+      "grad_norm": 2.1175410747528076,
+      "learning_rate": 9.033433016126822e-05,
+      "loss": 1.1088,
+      "mean_token_accuracy": 0.6099293529987335,
+      "num_tokens": 2869994.0,
+      "step": 2220
+    },
+    {
+      "entropy": 1.4636402249336242,
+      "epoch": 2.4239130434782608,
+      "grad_norm": 2.156698226928711,
+      "learning_rate": 9.022785604983139e-05,
+      "loss": 1.0813,
+      "mean_token_accuracy": 0.6157242119312286,
+      "num_tokens": 2882741.0,
+      "step": 2230
+    },
+    {
+      "entropy": 1.448258113861084,
+      "epoch": 2.4347826086956523,
+      "grad_norm": 2.1612956523895264,
+      "learning_rate": 9.01208621778606e-05,
+      "loss": 1.1146,
+      "mean_token_accuracy": 0.5982438385486603,
+      "num_tokens": 2895598.0,
+      "step": 2240
+    },
+    {
+      "entropy": 1.4485355496406556,
+      "epoch": 2.4456521739130435,
+      "grad_norm": 2.342386245727539,
+      "learning_rate": 9.001334992776094e-05,
+      "loss": 1.1075,
+      "mean_token_accuracy": 0.59824697971344,
+      "num_tokens": 2908649.0,
+      "step": 2250
+    },
+    {
+      "entropy": 1.4362555623054505,
+      "epoch": 2.4565217391304346,
+      "grad_norm": 2.226168394088745,
+      "learning_rate": 8.990532068863513e-05,
+      "loss": 1.0506,
+      "mean_token_accuracy": 0.6281331360340119,
+      "num_tokens": 2921983.0,
+      "step": 2260
+    },
+    {
+      "entropy": 1.453425133228302,
+      "epoch": 2.467391304347826,
+      "grad_norm": 2.0690982341766357,
+      "learning_rate": 8.979677585626559e-05,
+      "loss": 1.1294,
+      "mean_token_accuracy": 0.5930320501327515,
+      "num_tokens": 2934939.0,
+      "step": 2270
+    },
+    {
+      "entropy": 1.4780887007713317,
+      "epoch": 2.4782608695652173,
+      "grad_norm": 1.5956579446792603,
+      "learning_rate": 8.968771683309645e-05,
+      "loss": 1.1802,
+      "mean_token_accuracy": 0.5728381037712097,
+      "num_tokens": 2947635.0,
+      "step": 2280
+    },
+    {
+      "entropy": 1.4748708367347718,
+      "epoch": 2.489130434782609,
+      "grad_norm": 2.1426117420196533,
+      "learning_rate": 8.95781450282154e-05,
+      "loss": 1.1558,
+      "mean_token_accuracy": 0.591809231042862,
+      "num_tokens": 2960233.0,
+      "step": 2290
+    },
+    {
+      "entropy": 1.4672940373420715,
+      "epoch": 2.5,
+      "grad_norm": 2.2189557552337646,
+      "learning_rate": 8.946806185733543e-05,
+      "loss": 1.0865,
+      "mean_token_accuracy": 0.6044426262378693,
+      "num_tokens": 2973559.0,
+      "step": 2300
+    },
+    {
+      "entropy": 1.4749647736549378,
+      "epoch": 2.5108695652173916,
+      "grad_norm": 2.0216891765594482,
+      "learning_rate": 8.935746874277667e-05,
+      "loss": 1.1216,
+      "mean_token_accuracy": 0.5929350137710572,
+      "num_tokens": 2986671.0,
+      "step": 2310
+    },
+    {
+      "entropy": 1.468804383277893,
+      "epoch": 2.5217391304347827,
+      "grad_norm": 1.746561050415039,
+      "learning_rate": 8.924636711344784e-05,
+      "loss": 1.1529,
+      "mean_token_accuracy": 0.5794779539108277,
+      "num_tokens": 2999818.0,
+      "step": 2320
+    },
+    {
+      "entropy": 1.4579639315605164,
+      "epoch": 2.532608695652174,
+      "grad_norm": 2.7989909648895264,
+      "learning_rate": 8.913475840482797e-05,
+      "loss": 1.1037,
+      "mean_token_accuracy": 0.5982204794883728,
+      "num_tokens": 3012539.0,
+      "step": 2330
+    },
+    {
+      "entropy": 1.4399624347686768,
+      "epoch": 2.5434782608695654,
+      "grad_norm": 2.2155885696411133,
+      "learning_rate": 8.902264405894771e-05,
+      "loss": 1.0702,
+      "mean_token_accuracy": 0.6180161297321319,
+      "num_tokens": 3025100.0,
+      "step": 2340
+    },
+    {
+      "entropy": 1.4719431400299072,
+      "epoch": 2.5543478260869565,
+      "grad_norm": 2.0845561027526855,
+      "learning_rate": 8.891002552437076e-05,
+      "loss": 1.1735,
+      "mean_token_accuracy": 0.5835448384284974,
+      "num_tokens": 3037684.0,
+      "step": 2350
+    },
+    {
+      "entropy": 1.4472892165184021,
+      "epoch": 2.5652173913043477,
+      "grad_norm": 1.958483099937439,
+      "learning_rate": 8.879690425617517e-05,
+      "loss": 1.0761,
+      "mean_token_accuracy": 0.607106763124466,
+      "num_tokens": 3050626.0,
+      "step": 2360
+    },
+    {
+      "entropy": 1.4262180566787719,
+      "epoch": 2.5760869565217392,
+      "grad_norm": 2.5125794410705566,
+      "learning_rate": 8.868328171593448e-05,
+      "loss": 1.0234,
+      "mean_token_accuracy": 0.6303494095802307,
+      "num_tokens": 3063145.0,
+      "step": 2370
+    },
+    {
+      "entropy": 1.4762691497802733,
+      "epoch": 2.5869565217391304,
+      "grad_norm": 1.9473073482513428,
+      "learning_rate": 8.85691593716989e-05,
+      "loss": 1.1387,
+      "mean_token_accuracy": 0.5892048954963685,
+      "num_tokens": 3076300.0,
+      "step": 2380
+    },
+    {
+      "entropy": 1.4989805698394776,
+      "epoch": 2.5978260869565215,
+      "grad_norm": 2.1876816749572754,
+      "learning_rate": 8.845453869797631e-05,
+      "loss": 1.1517,
+      "mean_token_accuracy": 0.5851214408874512,
+      "num_tokens": 3089439.0,
+      "step": 2390
+    },
+    {
+      "entropy": 1.4469679355621339,
+      "epoch": 2.608695652173913,
+      "grad_norm": 2.1269173622131348,
+      "learning_rate": 8.833942117571321e-05,
+      "loss": 1.0865,
+      "mean_token_accuracy": 0.604921555519104,
+      "num_tokens": 3102252.0,
+      "step": 2400
+    },
+    {
+      "entropy": 1.4462681651115417,
+      "epoch": 2.619565217391304,
+      "grad_norm": 2.108311176300049,
+      "learning_rate": 8.822380829227559e-05,
+      "loss": 1.0939,
+      "mean_token_accuracy": 0.6013546764850617,
+      "num_tokens": 3115257.0,
+      "step": 2410
+    },
+    {
+      "entropy": 1.452495265007019,
+      "epoch": 2.630434782608696,
+      "grad_norm": 1.948119878768921,
+      "learning_rate": 8.810770154142969e-05,
+      "loss": 1.1095,
+      "mean_token_accuracy": 0.6048475563526153,
+      "num_tokens": 3128271.0,
+      "step": 2420
+    },
+    {
+      "entropy": 1.4279615521430968,
+      "epoch": 2.641304347826087,
+      "grad_norm": 2.3602614402770996,
+      "learning_rate": 8.799110242332276e-05,
+      "loss": 1.0708,
+      "mean_token_accuracy": 0.6152625262737275,
+      "num_tokens": 3140987.0,
+      "step": 2430
+    },
+    {
+      "entropy": 1.445945417881012,
+      "epoch": 2.6521739130434785,
+      "grad_norm": 1.9659548997879028,
+      "learning_rate": 8.787401244446361e-05,
+      "loss": 1.1057,
+      "mean_token_accuracy": 0.6044125318527221,
+      "num_tokens": 3154073.0,
+      "step": 2440
+    },
+    {
+      "entropy": 1.4435201168060303,
+      "epoch": 2.6630434782608696,
+      "grad_norm": 2.0865726470947266,
+      "learning_rate": 8.775643311770318e-05,
+      "loss": 1.0508,
+      "mean_token_accuracy": 0.6200963973999023,
+      "num_tokens": 3166774.0,
+      "step": 2450
+    },
+    {
+      "entropy": 1.4427612900733948,
+      "epoch": 2.6739130434782608,
+      "grad_norm": 1.95523202419281,
+      "learning_rate": 8.7638365962215e-05,
+      "loss": 1.0604,
+      "mean_token_accuracy": 0.6158404231071473,
+      "num_tokens": 3180239.0,
+      "step": 2460
+    },
+    {
+      "entropy": 1.480810034275055,
+      "epoch": 2.6847826086956523,
+      "grad_norm": 1.7022428512573242,
+      "learning_rate": 8.751981250347552e-05,
+      "loss": 1.1435,
+      "mean_token_accuracy": 0.5830896198749542,
+      "num_tokens": 3193085.0,
+      "step": 2470
+    },
+    {
+      "entropy": 1.4741726160049438,
+      "epoch": 2.6956521739130435,
+      "grad_norm": 1.9417518377304077,
+      "learning_rate": 8.740077427324446e-05,
+      "loss": 1.1434,
+      "mean_token_accuracy": 0.5773356080055236,
+      "num_tokens": 3206199.0,
+      "step": 2480
+    },
+    {
+      "entropy": 1.4773874044418336,
+      "epoch": 2.7065217391304346,
+      "grad_norm": 1.9512629508972168,
+      "learning_rate": 8.728125280954498e-05,
+      "loss": 1.1219,
+      "mean_token_accuracy": 0.5898585200309754,
+      "num_tokens": 3219296.0,
+      "step": 2490
+    },
+    {
+      "entropy": 1.4541948914527894,
+      "epoch": 2.717391304347826,
+      "grad_norm": 2.068798542022705,
+      "learning_rate": 8.716124965664379e-05,
+      "loss": 1.1123,
+      "mean_token_accuracy": 0.6047747969627381,
+      "num_tokens": 3232042.0,
+      "step": 2500
+    },
+    {
+      "entropy": 1.4830847024917602,
+      "epoch": 2.7282608695652173,
+      "grad_norm": 1.9898854494094849,
+      "learning_rate": 8.704076636503128e-05,
+      "loss": 1.1671,
+      "mean_token_accuracy": 0.5747832953929901,
+      "num_tokens": 3244815.0,
+      "step": 2510
+    },
+    {
+      "entropy": 1.4612651348114014,
+      "epoch": 2.7391304347826084,
+      "grad_norm": 2.2067410945892334,
+      "learning_rate": 8.691980449140135e-05,
+      "loss": 1.0978,
+      "mean_token_accuracy": 0.5956085741519928,
+      "num_tokens": 3257668.0,
+      "step": 2520
+    },
+    {
+      "entropy": 1.462200677394867,
+      "epoch": 2.75,
+      "grad_norm": 2.014509439468384,
+      "learning_rate": 8.679836559863148e-05,
+      "loss": 1.1169,
+      "mean_token_accuracy": 0.5984118282794952,
+      "num_tokens": 3270813.0,
+      "step": 2530
+    },
+    {
+      "entropy": 1.4501710295677186,
+      "epoch": 2.7608695652173916,
+      "grad_norm": 2.5156266689300537,
+      "learning_rate": 8.667645125576235e-05,
+      "loss": 1.1319,
+      "mean_token_accuracy": 0.5989238739013671,
+      "num_tokens": 3283624.0,
+      "step": 2540
+    },
+    {
+      "entropy": 1.4544371724128724,
+      "epoch": 2.7717391304347827,
+      "grad_norm": 1.9277708530426025,
+      "learning_rate": 8.655406303797767e-05,
+      "loss": 1.0811,
+      "mean_token_accuracy": 0.6054942727088928,
+      "num_tokens": 3296764.0,
+      "step": 2550
+    },
+    {
+      "entropy": 1.462872362136841,
+      "epoch": 2.782608695652174,
+      "grad_norm": 2.244176149368286,
+      "learning_rate": 8.643120252658381e-05,
+      "loss": 1.1231,
+      "mean_token_accuracy": 0.5996806621551514,
+      "num_tokens": 3310476.0,
+      "step": 2560
+    },
+    {
+      "entropy": 1.4650163173675537,
+      "epoch": 2.7934782608695654,
+      "grad_norm": 2.060368537902832,
+      "learning_rate": 8.630787130898943e-05,
+      "loss": 1.1183,
+      "mean_token_accuracy": 0.5905598402023315,
+      "num_tokens": 3323159.0,
+      "step": 2570
+    },
+    {
+      "entropy": 1.4386309385299683,
+      "epoch": 2.8043478260869565,
+      "grad_norm": 1.8049969673156738,
+      "learning_rate": 8.618407097868482e-05,
+      "loss": 1.0914,
+      "mean_token_accuracy": 0.6009307503700256,
+      "num_tokens": 3336124.0,
+      "step": 2580
+    },
+    {
+      "entropy": 1.4660000324249267,
+      "epoch": 2.8152173913043477,
+      "grad_norm": 1.997644066810608,
+      "learning_rate": 8.605980313522142e-05,
+      "loss": 1.1499,
+      "mean_token_accuracy": 0.5847253501415253,
+      "num_tokens": 3349224.0,
+      "step": 2590
+    },
+    {
+      "entropy": 1.432177233695984,
+      "epoch": 2.8260869565217392,
+      "grad_norm": 2.895296096801758,
+      "learning_rate": 8.59350693841912e-05,
+      "loss": 1.0535,
+      "mean_token_accuracy": 0.622643232345581,
+      "num_tokens": 3362020.0,
+      "step": 2600
+    },
+    {
+      "entropy": 1.4307915687561035,
+      "epoch": 2.8369565217391304,
+      "grad_norm": 2.170982599258423,
+      "learning_rate": 8.580987133720576e-05,
+      "loss": 1.0787,
+      "mean_token_accuracy": 0.6147644519805908,
+      "num_tokens": 3374950.0,
+      "step": 2610
+    },
+    {
+      "entropy": 1.4522701263427735,
+      "epoch": 2.8478260869565215,
+      "grad_norm": 2.1109468936920166,
+      "learning_rate": 8.568421061187567e-05,
+      "loss": 1.0683,
+      "mean_token_accuracy": 0.6159465253353119,
+      "num_tokens": 3387887.0,
+      "step": 2620
+    },
+    {
+      "entropy": 1.435029399394989,
+      "epoch": 2.858695652173913,
+      "grad_norm": 2.271132469177246,
+      "learning_rate": 8.55580888317894e-05,
+      "loss": 1.0815,
+      "mean_token_accuracy": 0.6143145263195038,
+      "num_tokens": 3401084.0,
+      "step": 2630
+    },
+    {
+      "entropy": 1.4437922477722167,
+      "epoch": 2.869565217391304,
+      "grad_norm": 2.1083803176879883,
+      "learning_rate": 8.543150762649257e-05,
+      "loss": 1.1018,
+      "mean_token_accuracy": 0.6052383124828339,
+      "num_tokens": 3414604.0,
+      "step": 2640
+    },
+    {
+      "entropy": 1.4524288296699523,
+      "epoch": 2.880434782608696,
+      "grad_norm": 2.2077994346618652,
+      "learning_rate": 8.530446863146664e-05,
+      "loss": 1.1257,
+      "mean_token_accuracy": 0.5953928649425506,
+      "num_tokens": 3427383.0,
+      "step": 2650
+    },
+    {
+      "entropy": 1.4274506211280822,
+      "epoch": 2.891304347826087,
+      "grad_norm": 2.1553750038146973,
+      "learning_rate": 8.517697348810798e-05,
+      "loss": 1.1036,
+      "mean_token_accuracy": 0.6045750975608826,
+      "num_tokens": 3440363.0,
+      "step": 2660
+    },
+    {
+      "entropy": 1.4272564888000487,
+      "epoch": 2.9021739130434785,
+      "grad_norm": 1.6765693426132202,
+      "learning_rate": 8.504902384370657e-05,
+      "loss": 1.0936,
+      "mean_token_accuracy": 0.6030111908912659,
+      "num_tokens": 3453079.0,
+      "step": 2670
+    },
+    {
+      "entropy": 1.4609074950218202,
+      "epoch": 2.9130434782608696,
+      "grad_norm": 2.0705795288085938,
+      "learning_rate": 8.492062135142469e-05,
+      "loss": 1.1471,
+      "mean_token_accuracy": 0.5836342215538025,
+      "num_tokens": 3466067.0,
+      "step": 2680
+    },
+    {
+      "entropy": 1.4428972482681275,
+      "epoch": 2.9239130434782608,
+      "grad_norm": 1.660536289215088,
+      "learning_rate": 8.479176767027566e-05,
+      "loss": 1.1191,
+      "mean_token_accuracy": 0.5946624577045441,
+      "num_tokens": 3479321.0,
+      "step": 2690
+    },
+    {
+      "entropy": 1.4401050209999084,
+      "epoch": 2.9347826086956523,
+      "grad_norm": 2.135024070739746,
+      "learning_rate": 8.466246446510231e-05,
+      "loss": 1.1123,
+      "mean_token_accuracy": 0.5984612464904785,
+      "num_tokens": 3492053.0,
+      "step": 2700
+    },
+    {
+      "entropy": 1.4631260633468628,
+      "epoch": 2.9456521739130435,
+      "grad_norm": 1.980193853378296,
+      "learning_rate": 8.45327134065555e-05,
+      "loss": 1.1476,
+      "mean_token_accuracy": 0.5893894731998444,
+      "num_tokens": 3505044.0,
+      "step": 2710
+    },
+    {
+      "entropy": 1.4601631045341492,
+      "epoch": 2.9565217391304346,
+      "grad_norm": 1.9377002716064453,
+      "learning_rate": 8.44025161710726e-05,
+      "loss": 1.0807,
+      "mean_token_accuracy": 0.6130505263805389,
+      "num_tokens": 3518122.0,
+      "step": 2720
+    },
+    {
+      "entropy": 1.4349589943885803,
+      "epoch": 2.967391304347826,
+      "grad_norm": 2.004295587539673,
+      "learning_rate": 8.42718744408557e-05,
+      "loss": 1.0994,
+      "mean_token_accuracy": 0.6051357507705688,
+      "num_tokens": 3530855.0,
+      "step": 2730
+    },
+    {
+      "entropy": 1.4589566707611084,
+      "epoch": 2.9782608695652173,
+      "grad_norm": 2.228742837905884,
+      "learning_rate": 8.414078990384995e-05,
+      "loss": 1.1294,
+      "mean_token_accuracy": 0.5972096085548401,
+      "num_tokens": 3543926.0,
+      "step": 2740
+    },
+    {
+      "entropy": 1.4834345698356628,
+      "epoch": 2.9891304347826084,
+      "grad_norm": 1.964888095855713,
+      "learning_rate": 8.400926425372182e-05,
+      "loss": 1.182,
+      "mean_token_accuracy": 0.582332044839859,
+      "num_tokens": 3556590.0,
+      "step": 2750
+    },
+    {
+      "entropy": 1.4434187054634093,
+      "epoch": 3.0,
+      "grad_norm": 2.4529945850372314,
+      "learning_rate": 8.387729918983706e-05,
+      "loss": 1.0934,
+      "mean_token_accuracy": 0.6053484171628952,
+      "num_tokens": 3569982.0,
+      "step": 2760
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 9200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.518842606712238e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d307793ac8defecd3c83909e3edd67ba0adff5dab9d19e8ababe22ba1e871ad
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39fc106588aab87c0c75f9a3096b4f5a4f8b5d70b6ee9265174e6306e3ca2d67
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d307793ac8defecd3c83909e3edd67ba0adff5dab9d19e8ababe22ba1e871ad
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b4662618e7ac602191b7ecb54df44c2644a9b346fb4c281593eaa7b19b6eaa8
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896