agu18dec committed on 26 days ago

Commit

e77c492

verified ·

1 Parent(s): d0d7f35

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/README.md +61 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/trainer_state.json +1044 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/trainer_state.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/trainer_state.json +2064 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/adapter_config.json +48 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/tokenizer.json +3 -0

.gitattributes CHANGED Viewed

@@ -589,3 +589,14 @@ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/chec
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-8648/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-9729/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-8648/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/checkpoint-9729/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a4_B1_L20_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-4072/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-5090/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-6108/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-7126/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-8144/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-9162/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/lglg8c92)
+This model was trained with SFT.
+### Framework versions
+- PEFT 0.19.1
+- TRL: 0.28.0
+- Transformers: 4.57.6
+- Pytorch: 2.9.1
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:381732958f79fd21a1d81d99b3da9598d3ece25b8d96f4eb721a0f5a6e987c38
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:596c7bc50e314ac66495722616bce02e4951620b0536d5c59cb0ab960f5b9304
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1044 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1018,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.238373827934265,
+      "epoch": 0.009823182711198428,
+      "grad_norm": 2.1393158435821533,
+      "learning_rate": 1.768172888015717e-06,
+      "loss": 0.5677,
+      "mean_token_accuracy": 0.7972675561904907,
+      "num_tokens": 13613.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.248021376132965,
+      "epoch": 0.019646365422396856,
+      "grad_norm": 2.1040091514587402,
+      "learning_rate": 3.732809430255403e-06,
+      "loss": 0.6091,
+      "mean_token_accuracy": 0.7844378292560578,
+      "num_tokens": 27766.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.2259408831596375,
+      "epoch": 0.029469548133595286,
+      "grad_norm": 2.8529012203216553,
+      "learning_rate": 5.697445972495088e-06,
+      "loss": 0.6106,
+      "mean_token_accuracy": 0.7926133036613464,
+      "num_tokens": 41975.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.241147220134735,
+      "epoch": 0.03929273084479371,
+      "grad_norm": 1.9179528951644897,
+      "learning_rate": 7.662082514734775e-06,
+      "loss": 0.5673,
+      "mean_token_accuracy": 0.7996828734874726,
+      "num_tokens": 55356.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.2450526833534241,
+      "epoch": 0.04911591355599214,
+      "grad_norm": 2.696981191635132,
+      "learning_rate": 9.62671905697446e-06,
+      "loss": 0.5686,
+      "mean_token_accuracy": 0.7941810250282287,
+      "num_tokens": 69627.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2605302929878235,
+      "epoch": 0.05893909626719057,
+      "grad_norm": 2.4317052364349365,
+      "learning_rate": 1.1591355599214145e-05,
+      "loss": 0.5802,
+      "mean_token_accuracy": 0.7867559194564819,
+      "num_tokens": 82999.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.2502994418144227,
+      "epoch": 0.068762278978389,
+      "grad_norm": 2.431112289428711,
+      "learning_rate": 1.3555992141453833e-05,
+      "loss": 0.5592,
+      "mean_token_accuracy": 0.7943781077861786,
+      "num_tokens": 96494.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.2417351365089417,
+      "epoch": 0.07858546168958742,
+      "grad_norm": 1.8804676532745361,
+      "learning_rate": 1.5520628683693518e-05,
+      "loss": 0.551,
+      "mean_token_accuracy": 0.7976289927959442,
+      "num_tokens": 110116.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.2354993939399719,
+      "epoch": 0.08840864440078586,
+      "grad_norm": 1.887425184249878,
+      "learning_rate": 1.7485265225933202e-05,
+      "loss": 0.4783,
+      "mean_token_accuracy": 0.830482566356659,
+      "num_tokens": 123915.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.2492876887321471,
+      "epoch": 0.09823182711198428,
+      "grad_norm": 1.531612515449524,
+      "learning_rate": 1.944990176817289e-05,
+      "loss": 0.5771,
+      "mean_token_accuracy": 0.7916022062301635,
+      "num_tokens": 137813.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.2218821048736572,
+      "epoch": 0.10805500982318271,
+      "grad_norm": 1.4735031127929688,
+      "learning_rate": 2.1414538310412574e-05,
+      "loss": 0.5318,
+      "mean_token_accuracy": 0.8113921225070954,
+      "num_tokens": 152025.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.2572803258895875,
+      "epoch": 0.11787819253438114,
+      "grad_norm": 1.5297958850860596,
+      "learning_rate": 2.3379174852652262e-05,
+      "loss": 0.6333,
+      "mean_token_accuracy": 0.7640059947967529,
+      "num_tokens": 165940.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.228113317489624,
+      "epoch": 0.12770137524557956,
+      "grad_norm": 1.594042181968689,
+      "learning_rate": 2.5343811394891947e-05,
+      "loss": 0.5409,
+      "mean_token_accuracy": 0.8003769814968109,
+      "num_tokens": 179385.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.202523648738861,
+      "epoch": 0.137524557956778,
+      "grad_norm": 1.2444961071014404,
+      "learning_rate": 2.730844793713163e-05,
+      "loss": 0.4893,
+      "mean_token_accuracy": 0.8203648805618287,
+      "num_tokens": 193383.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.2140995144844056,
+      "epoch": 0.14734774066797643,
+      "grad_norm": 1.4760793447494507,
+      "learning_rate": 2.9273084479371316e-05,
+      "loss": 0.5903,
+      "mean_token_accuracy": 0.7915676176548004,
+      "num_tokens": 206472.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.2337120175361633,
+      "epoch": 0.15717092337917485,
+      "grad_norm": 1.7599576711654663,
+      "learning_rate": 3.123772102161101e-05,
+      "loss": 0.5066,
+      "mean_token_accuracy": 0.8161738157272339,
+      "num_tokens": 220423.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.2380502581596375,
+      "epoch": 0.16699410609037327,
+      "grad_norm": 1.5037227869033813,
+      "learning_rate": 3.320235756385069e-05,
+      "loss": 0.5422,
+      "mean_token_accuracy": 0.8013253211975098,
+      "num_tokens": 235005.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.2622791171073913,
+      "epoch": 0.17681728880157171,
+      "grad_norm": 1.376826286315918,
+      "learning_rate": 3.5166994106090376e-05,
+      "loss": 0.581,
+      "mean_token_accuracy": 0.7882379591464996,
+      "num_tokens": 248939.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.2461886525154113,
+      "epoch": 0.18664047151277013,
+      "grad_norm": 1.4486498832702637,
+      "learning_rate": 3.713163064833006e-05,
+      "loss": 0.5537,
+      "mean_token_accuracy": 0.7965423583984375,
+      "num_tokens": 263177.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.242757785320282,
+      "epoch": 0.19646365422396855,
+      "grad_norm": 1.8996150493621826,
+      "learning_rate": 3.9096267190569745e-05,
+      "loss": 0.518,
+      "mean_token_accuracy": 0.8233550667762757,
+      "num_tokens": 276500.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.2447792530059814,
+      "epoch": 0.206286836935167,
+      "grad_norm": 1.6521553993225098,
+      "learning_rate": 4.106090373280943e-05,
+      "loss": 0.4893,
+      "mean_token_accuracy": 0.8305010080337525,
+      "num_tokens": 290500.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.2488795518875122,
+      "epoch": 0.21611001964636542,
+      "grad_norm": 1.757602572441101,
+      "learning_rate": 4.302554027504912e-05,
+      "loss": 0.4861,
+      "mean_token_accuracy": 0.82521493434906,
+      "num_tokens": 304231.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.2686235189437867,
+      "epoch": 0.22593320235756384,
+      "grad_norm": 1.4095959663391113,
+      "learning_rate": 4.4990176817288805e-05,
+      "loss": 0.555,
+      "mean_token_accuracy": 0.8019413590431214,
+      "num_tokens": 317976.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.2709406733512878,
+      "epoch": 0.2357563850687623,
+      "grad_norm": 1.935375452041626,
+      "learning_rate": 4.695481335952849e-05,
+      "loss": 0.5649,
+      "mean_token_accuracy": 0.8037905693054199,
+      "num_tokens": 331858.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.2544381499290467,
+      "epoch": 0.2455795677799607,
+      "grad_norm": 1.607476830482483,
+      "learning_rate": 4.8919449901768174e-05,
+      "loss": 0.5283,
+      "mean_token_accuracy": 0.8084118843078614,
+      "num_tokens": 345259.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.2540257692337036,
+      "epoch": 0.2554027504911591,
+      "grad_norm": 1.4414503574371338,
+      "learning_rate": 5.088408644400786e-05,
+      "loss": 0.5299,
+      "mean_token_accuracy": 0.8094106495380402,
+      "num_tokens": 360160.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.2452853798866272,
+      "epoch": 0.26522593320235754,
+      "grad_norm": 1.7544183731079102,
+      "learning_rate": 5.284872298624754e-05,
+      "loss": 0.494,
+      "mean_token_accuracy": 0.820741331577301,
+      "num_tokens": 373823.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.2502075552940368,
+      "epoch": 0.275049115913556,
+      "grad_norm": 1.8113288879394531,
+      "learning_rate": 5.481335952848723e-05,
+      "loss": 0.5062,
+      "mean_token_accuracy": 0.8143646121025085,
+      "num_tokens": 387923.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.2456924200057984,
+      "epoch": 0.28487229862475444,
+      "grad_norm": 1.7740198373794556,
+      "learning_rate": 5.677799607072691e-05,
+      "loss": 0.5139,
+      "mean_token_accuracy": 0.8153488993644714,
+      "num_tokens": 401453.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.2440819978713988,
+      "epoch": 0.29469548133595286,
+      "grad_norm": 1.0885217189788818,
+      "learning_rate": 5.874263261296661e-05,
+      "loss": 0.5045,
+      "mean_token_accuracy": 0.8190208375453949,
+      "num_tokens": 415468.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.2703365564346314,
+      "epoch": 0.3045186640471513,
+      "grad_norm": 1.416458249092102,
+      "learning_rate": 6.0707269155206295e-05,
+      "loss": 0.5524,
+      "mean_token_accuracy": 0.8009683132171631,
+      "num_tokens": 428767.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.2677528381347656,
+      "epoch": 0.3143418467583497,
+      "grad_norm": 1.9646638631820679,
+      "learning_rate": 6.267190569744598e-05,
+      "loss": 0.6255,
+      "mean_token_accuracy": 0.7640788197517395,
+      "num_tokens": 441931.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.2437112927436829,
+      "epoch": 0.3241650294695481,
+      "grad_norm": 1.4724621772766113,
+      "learning_rate": 6.463654223968566e-05,
+      "loss": 0.5284,
+      "mean_token_accuracy": 0.8117543816566467,
+      "num_tokens": 455302.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.26444011926651,
+      "epoch": 0.33398821218074654,
+      "grad_norm": 1.810052752494812,
+      "learning_rate": 6.660117878192535e-05,
+      "loss": 0.546,
+      "mean_token_accuracy": 0.808152836561203,
+      "num_tokens": 468961.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.2795380353927612,
+      "epoch": 0.343811394891945,
+      "grad_norm": 1.4199846982955933,
+      "learning_rate": 6.856581532416503e-05,
+      "loss": 0.5776,
+      "mean_token_accuracy": 0.7861001551151275,
+      "num_tokens": 482094.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.2831590175628662,
+      "epoch": 0.35363457760314343,
+      "grad_norm": 1.4240669012069702,
+      "learning_rate": 7.053045186640472e-05,
+      "loss": 0.5551,
+      "mean_token_accuracy": 0.7980610370635987,
+      "num_tokens": 495551.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.3066895961761475,
+      "epoch": 0.36345776031434185,
+      "grad_norm": 3.1153318881988525,
+      "learning_rate": 7.249508840864441e-05,
+      "loss": 0.6116,
+      "mean_token_accuracy": 0.7829572439193726,
+      "num_tokens": 509826.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.2638443112373352,
+      "epoch": 0.37328094302554027,
+      "grad_norm": 2.1263859272003174,
+      "learning_rate": 7.445972495088409e-05,
+      "loss": 0.5183,
+      "mean_token_accuracy": 0.8127178907394409,
+      "num_tokens": 523744.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.2958194017410278,
+      "epoch": 0.3831041257367387,
+      "grad_norm": 1.4014490842819214,
+      "learning_rate": 7.642436149312378e-05,
+      "loss": 0.6537,
+      "mean_token_accuracy": 0.7570026934146881,
+      "num_tokens": 537454.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.2576130390167237,
+      "epoch": 0.3929273084479371,
+      "grad_norm": 1.755161166191101,
+      "learning_rate": 7.838899803536346e-05,
+      "loss": 0.4932,
+      "mean_token_accuracy": 0.8207253098487854,
+      "num_tokens": 551546.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.2616795778274537,
+      "epoch": 0.4027504911591356,
+      "grad_norm": 1.5902996063232422,
+      "learning_rate": 8.035363457760315e-05,
+      "loss": 0.5245,
+      "mean_token_accuracy": 0.8081269204616547,
+      "num_tokens": 565707.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.2840699076652526,
+      "epoch": 0.412573673870334,
+      "grad_norm": 1.6743026971817017,
+      "learning_rate": 8.231827111984284e-05,
+      "loss": 0.5767,
+      "mean_token_accuracy": 0.7862762212753296,
+      "num_tokens": 579728.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.2563459753990174,
+      "epoch": 0.4223968565815324,
+      "grad_norm": 1.193405032157898,
+      "learning_rate": 8.428290766208252e-05,
+      "loss": 0.543,
+      "mean_token_accuracy": 0.8090866565704345,
+      "num_tokens": 593579.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.283545970916748,
+      "epoch": 0.43222003929273084,
+      "grad_norm": 1.5221855640411377,
+      "learning_rate": 8.62475442043222e-05,
+      "loss": 0.6117,
+      "mean_token_accuracy": 0.7837108314037323,
+      "num_tokens": 606791.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.2693499445915222,
+      "epoch": 0.44204322200392926,
+      "grad_norm": 1.6853163242340088,
+      "learning_rate": 8.821218074656188e-05,
+      "loss": 0.5621,
+      "mean_token_accuracy": 0.7976293742656708,
+      "num_tokens": 620602.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.2506368160247803,
+      "epoch": 0.4518664047151277,
+      "grad_norm": 1.6164072751998901,
+      "learning_rate": 9.017681728880158e-05,
+      "loss": 0.5205,
+      "mean_token_accuracy": 0.807697081565857,
+      "num_tokens": 634566.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.2814919590950011,
+      "epoch": 0.46168958742632615,
+      "grad_norm": 1.7173436880111694,
+      "learning_rate": 9.214145383104125e-05,
+      "loss": 0.5411,
+      "mean_token_accuracy": 0.8121235728263855,
+      "num_tokens": 648676.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.2675794124603272,
+      "epoch": 0.4715127701375246,
+      "grad_norm": 1.446212649345398,
+      "learning_rate": 9.410609037328096e-05,
+      "loss": 0.5563,
+      "mean_token_accuracy": 0.803589540719986,
+      "num_tokens": 662286.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.2913037061691284,
+      "epoch": 0.481335952848723,
+      "grad_norm": 1.630800485610962,
+      "learning_rate": 9.607072691552064e-05,
+      "loss": 0.6166,
+      "mean_token_accuracy": 0.7754902184009552,
+      "num_tokens": 675740.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.2715419769287108,
+      "epoch": 0.4911591355599214,
+      "grad_norm": 1.598403811454773,
+      "learning_rate": 9.803536345776033e-05,
+      "loss": 0.5614,
+      "mean_token_accuracy": 0.7951632618904114,
+      "num_tokens": 689865.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.285755467414856,
+      "epoch": 0.5009823182711198,
+      "grad_norm": 2.030689001083374,
+      "learning_rate": 0.0001,
+      "loss": 0.5434,
+      "mean_token_accuracy": 0.8062463700771332,
+      "num_tokens": 703743.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.3047456383705138,
+      "epoch": 0.5108055009823183,
+      "grad_norm": 1.8453326225280762,
+      "learning_rate": 9.999973618674915e-05,
+      "loss": 0.6121,
+      "mean_token_accuracy": 0.7798721611499786,
+      "num_tokens": 717512.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.282051682472229,
+      "epoch": 0.5206286836935167,
+      "grad_norm": 1.6759898662567139,
+      "learning_rate": 9.999894474978048e-05,
+      "loss": 0.5239,
+      "mean_token_accuracy": 0.8078398644924164,
+      "num_tokens": 731162.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.2989009261131286,
+      "epoch": 0.5304518664047151,
+      "grad_norm": 1.7401994466781616,
+      "learning_rate": 9.999762569744566e-05,
+      "loss": 0.5816,
+      "mean_token_accuracy": 0.7974128127098083,
+      "num_tokens": 745217.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.3011240839958191,
+      "epoch": 0.5402750491159135,
+      "grad_norm": 1.9052543640136719,
+      "learning_rate": 9.999577904366405e-05,
+      "loss": 0.555,
+      "mean_token_accuracy": 0.7961248874664306,
+      "num_tokens": 758976.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.3102270722389222,
+      "epoch": 0.550098231827112,
+      "grad_norm": 1.7880445718765259,
+      "learning_rate": 9.999340480792247e-05,
+      "loss": 0.6029,
+      "mean_token_accuracy": 0.7876855313777924,
+      "num_tokens": 772471.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.2655692934989928,
+      "epoch": 0.5599214145383105,
+      "grad_norm": 1.7137025594711304,
+      "learning_rate": 9.999050301527515e-05,
+      "loss": 0.5436,
+      "mean_token_accuracy": 0.800448739528656,
+      "num_tokens": 785857.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.2898759365081787,
+      "epoch": 0.5697445972495089,
+      "grad_norm": 2.03338360786438,
+      "learning_rate": 9.998707369634334e-05,
+      "loss": 0.5647,
+      "mean_token_accuracy": 0.7925122499465942,
+      "num_tokens": 799507.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.2878461837768556,
+      "epoch": 0.5795677799607073,
+      "grad_norm": 1.7258025407791138,
+      "learning_rate": 9.998311688731503e-05,
+      "loss": 0.5495,
+      "mean_token_accuracy": 0.7970447540283203,
+      "num_tokens": 813884.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.2761369585990905,
+      "epoch": 0.5893909626719057,
+      "grad_norm": 1.9462448358535767,
+      "learning_rate": 9.997863262994456e-05,
+      "loss": 0.5295,
+      "mean_token_accuracy": 0.8060545325279236,
+      "num_tokens": 827542.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.2756438493728637,
+      "epoch": 0.5992141453831041,
+      "grad_norm": 2.06569242477417,
+      "learning_rate": 9.99736209715522e-05,
+      "loss": 0.5747,
+      "mean_token_accuracy": 0.7955709218978881,
+      "num_tokens": 841676.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.2753918766975403,
+      "epoch": 0.6090373280943026,
+      "grad_norm": 1.7314369678497314,
+      "learning_rate": 9.996808196502362e-05,
+      "loss": 0.5151,
+      "mean_token_accuracy": 0.8180197477340698,
+      "num_tokens": 855269.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.2783099055290221,
+      "epoch": 0.618860510805501,
+      "grad_norm": 1.6164512634277344,
+      "learning_rate": 9.996201566880935e-05,
+      "loss": 0.4961,
+      "mean_token_accuracy": 0.8200631260871887,
+      "num_tokens": 868735.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.2850772857666015,
+      "epoch": 0.6286836935166994,
+      "grad_norm": 1.5462535619735718,
+      "learning_rate": 9.995542214692418e-05,
+      "loss": 0.5916,
+      "mean_token_accuracy": 0.7909732520580292,
+      "num_tokens": 882232.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.2697942018508912,
+      "epoch": 0.6385068762278978,
+      "grad_norm": 1.9398994445800781,
+      "learning_rate": 9.99483014689464e-05,
+      "loss": 0.5054,
+      "mean_token_accuracy": 0.8184501647949218,
+      "num_tokens": 895363.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.3003694057464599,
+      "epoch": 0.6483300589390962,
+      "grad_norm": 1.6913245916366577,
+      "learning_rate": 9.994065371001724e-05,
+      "loss": 0.5658,
+      "mean_token_accuracy": 0.7982197999954224,
+      "num_tokens": 909912.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.3075840830802918,
+      "epoch": 0.6581532416502947,
+      "grad_norm": 1.5393342971801758,
+      "learning_rate": 9.993247895083988e-05,
+      "loss": 0.574,
+      "mean_token_accuracy": 0.7920112848281861,
+      "num_tokens": 923818.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.2735092639923096,
+      "epoch": 0.6679764243614931,
+      "grad_norm": 1.6885005235671997,
+      "learning_rate": 9.99237772776787e-05,
+      "loss": 0.539,
+      "mean_token_accuracy": 0.7991899967193603,
+      "num_tokens": 937453.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.2649645924568176,
+      "epoch": 0.6777996070726916,
+      "grad_norm": 1.463413953781128,
+      "learning_rate": 9.991454878235837e-05,
+      "loss": 0.5361,
+      "mean_token_accuracy": 0.8108624756336212,
+      "num_tokens": 950998.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.2696751356124878,
+      "epoch": 0.68762278978389,
+      "grad_norm": 1.9994075298309326,
+      "learning_rate": 9.990479356226288e-05,
+      "loss": 0.5365,
+      "mean_token_accuracy": 0.8130120277404785,
+      "num_tokens": 964386.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.285075318813324,
+      "epoch": 0.6974459724950884,
+      "grad_norm": 1.7897218465805054,
+      "learning_rate": 9.989451172033447e-05,
+      "loss": 0.5871,
+      "mean_token_accuracy": 0.7820332407951355,
+      "num_tokens": 978060.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.2756176710128784,
+      "epoch": 0.7072691552062869,
+      "grad_norm": 1.7371011972427368,
+      "learning_rate": 9.98837033650726e-05,
+      "loss": 0.5597,
+      "mean_token_accuracy": 0.7937583506107331,
+      "num_tokens": 991672.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.2849397659301758,
+      "epoch": 0.7170923379174853,
+      "grad_norm": 1.641719937324524,
+      "learning_rate": 9.987236861053274e-05,
+      "loss": 0.5843,
+      "mean_token_accuracy": 0.7939905822277069,
+      "num_tokens": 1005457.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.2925720930099487,
+      "epoch": 0.7269155206286837,
+      "grad_norm": 1.8728162050247192,
+      "learning_rate": 9.986050757632525e-05,
+      "loss": 0.5755,
+      "mean_token_accuracy": 0.7945402979850769,
+      "num_tokens": 1019406.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.291877806186676,
+      "epoch": 0.7367387033398821,
+      "grad_norm": 1.6056290864944458,
+      "learning_rate": 9.984812038761405e-05,
+      "loss": 0.6116,
+      "mean_token_accuracy": 0.7776188969612121,
+      "num_tokens": 1032927.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.3065629363059998,
+      "epoch": 0.7465618860510805,
+      "grad_norm": 1.4939526319503784,
+      "learning_rate": 9.983520717511529e-05,
+      "loss": 0.6408,
+      "mean_token_accuracy": 0.7670970022678375,
+      "num_tokens": 1045833.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.282605803012848,
+      "epoch": 0.756385068762279,
+      "grad_norm": 1.7990894317626953,
+      "learning_rate": 9.982176807509607e-05,
+      "loss": 0.5696,
+      "mean_token_accuracy": 0.7870432496070862,
+      "num_tokens": 1059607.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.2711770296096803,
+      "epoch": 0.7662082514734774,
+      "grad_norm": 1.559313416481018,
+      "learning_rate": 9.980780322937287e-05,
+      "loss": 0.5315,
+      "mean_token_accuracy": 0.8101322710514068,
+      "num_tokens": 1073031.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.2708292603492737,
+      "epoch": 0.7760314341846758,
+      "grad_norm": 1.5710434913635254,
+      "learning_rate": 9.979331278531016e-05,
+      "loss": 0.5539,
+      "mean_token_accuracy": 0.8038661122322083,
+      "num_tokens": 1086355.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.3074462771415711,
+      "epoch": 0.7858546168958742,
+      "grad_norm": 1.5835829973220825,
+      "learning_rate": 9.977829689581877e-05,
+      "loss": 0.6236,
+      "mean_token_accuracy": 0.7792715787887573,
+      "num_tokens": 1100429.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.296637237071991,
+      "epoch": 0.7956777996070727,
+      "grad_norm": 1.571682095527649,
+      "learning_rate": 9.976275571935435e-05,
+      "loss": 0.5913,
+      "mean_token_accuracy": 0.7940649032592774,
+      "num_tokens": 1114663.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.294349157810211,
+      "epoch": 0.8055009823182712,
+      "grad_norm": 1.4109233617782593,
+      "learning_rate": 9.974668941991561e-05,
+      "loss": 0.6248,
+      "mean_token_accuracy": 0.7714365422725677,
+      "num_tokens": 1128403.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.319439172744751,
+      "epoch": 0.8153241650294696,
+      "grad_norm": 1.6622600555419922,
+      "learning_rate": 9.973009816704267e-05,
+      "loss": 0.6399,
+      "mean_token_accuracy": 0.774699580669403,
+      "num_tokens": 1142331.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.302856945991516,
+      "epoch": 0.825147347740668,
+      "grad_norm": 2.4136300086975098,
+      "learning_rate": 9.971298213581522e-05,
+      "loss": 0.5716,
+      "mean_token_accuracy": 0.7897345960140228,
+      "num_tokens": 1156242.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.287725281715393,
+      "epoch": 0.8349705304518664,
+      "grad_norm": 1.7344591617584229,
+      "learning_rate": 9.96953415068507e-05,
+      "loss": 0.5818,
+      "mean_token_accuracy": 0.7853143334388732,
+      "num_tokens": 1170390.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.258513343334198,
+      "epoch": 0.8447937131630648,
+      "grad_norm": 1.8267909288406372,
+      "learning_rate": 9.967717646630235e-05,
+      "loss": 0.5366,
+      "mean_token_accuracy": 0.8063280463218689,
+      "num_tokens": 1183788.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.2829570293426513,
+      "epoch": 0.8546168958742633,
+      "grad_norm": 1.7665373086929321,
+      "learning_rate": 9.965848720585734e-05,
+      "loss": 0.5489,
+      "mean_token_accuracy": 0.7993226885795593,
+      "num_tokens": 1197326.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.3128790736198426,
+      "epoch": 0.8644400785854617,
+      "grad_norm": 1.6902852058410645,
+      "learning_rate": 9.963927392273462e-05,
+      "loss": 0.6228,
+      "mean_token_accuracy": 0.7713834345340729,
+      "num_tokens": 1211392.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.3291242480278016,
+      "epoch": 0.8742632612966601,
+      "grad_norm": 2.342031240463257,
+      "learning_rate": 9.961953681968297e-05,
+      "loss": 0.6504,
+      "mean_token_accuracy": 0.7650188624858856,
+      "num_tokens": 1225560.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.3011715769767762,
+      "epoch": 0.8840864440078585,
+      "grad_norm": 2.45995831489563,
+      "learning_rate": 9.959927610497874e-05,
+      "loss": 0.617,
+      "mean_token_accuracy": 0.7712440609931945,
+      "num_tokens": 1239533.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.2857048749923705,
+      "epoch": 0.8939096267190569,
+      "grad_norm": 1.9025800228118896,
+      "learning_rate": 9.957849199242374e-05,
+      "loss": 0.5763,
+      "mean_token_accuracy": 0.787699168920517,
+      "num_tokens": 1253319.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.2860328674316406,
+      "epoch": 0.9037328094302554,
+      "grad_norm": 1.6897069215774536,
+      "learning_rate": 9.955718470134295e-05,
+      "loss": 0.5671,
+      "mean_token_accuracy": 0.8008992373943329,
+      "num_tokens": 1266631.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.2990724086761474,
+      "epoch": 0.9135559921414538,
+      "grad_norm": 1.771283745765686,
+      "learning_rate": 9.953535445658218e-05,
+      "loss": 0.6136,
+      "mean_token_accuracy": 0.7856487035751343,
+      "num_tokens": 1280141.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.2943416357040405,
+      "epoch": 0.9233791748526523,
+      "grad_norm": 2.1040866374969482,
+      "learning_rate": 9.951300148850576e-05,
+      "loss": 0.5738,
+      "mean_token_accuracy": 0.7893698453903198,
+      "num_tokens": 1294070.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.3020179510116576,
+      "epoch": 0.9332023575638507,
+      "grad_norm": 1.752038836479187,
+      "learning_rate": 9.949012603299404e-05,
+      "loss": 0.5919,
+      "mean_token_accuracy": 0.7839440703392029,
+      "num_tokens": 1308196.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.3016840934753418,
+      "epoch": 0.9430255402750491,
+      "grad_norm": 1.8453019857406616,
+      "learning_rate": 9.946672833144097e-05,
+      "loss": 0.5754,
+      "mean_token_accuracy": 0.7904948055744171,
+      "num_tokens": 1322261.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.2948450207710267,
+      "epoch": 0.9528487229862476,
+      "grad_norm": 1.617616891860962,
+      "learning_rate": 9.944280863075148e-05,
+      "loss": 0.5965,
+      "mean_token_accuracy": 0.7802974283695221,
+      "num_tokens": 1336225.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.302824819087982,
+      "epoch": 0.962671905697446,
+      "grad_norm": 1.8175745010375977,
+      "learning_rate": 9.941836718333894e-05,
+      "loss": 0.6292,
+      "mean_token_accuracy": 0.7750960767269135,
+      "num_tokens": 1349509.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.2981536149978639,
+      "epoch": 0.9724950884086444,
+      "grad_norm": 1.411348581314087,
+      "learning_rate": 9.939340424712247e-05,
+      "loss": 0.5127,
+      "mean_token_accuracy": 0.8160092115402222,
+      "num_tokens": 1363443.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.321254277229309,
+      "epoch": 0.9823182711198428,
+      "grad_norm": 1.8093916177749634,
+      "learning_rate": 9.936792008552418e-05,
+      "loss": 0.6142,
+      "mean_token_accuracy": 0.7801197230815887,
+      "num_tokens": 1377815.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.2920042157173157,
+      "epoch": 0.9921414538310412,
+      "grad_norm": 2.154522657394409,
+      "learning_rate": 9.934191496746647e-05,
+      "loss": 0.5433,
+      "mean_token_accuracy": 0.7936553716659546,
+      "num_tokens": 1391706.0,
+      "step": 1010
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10180,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.965273778720256e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca00661b4201b9c900ba613719f42e2216580f8bd1d0e3994cb00560554804cf
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-1018/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:381732958f79fd21a1d81d99b3da9598d3ece25b8d96f4eb721a0f5a6e987c38
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca00661b4201b9c900ba613719f42e2216580f8bd1d0e3994cb00560554804cf
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-10180/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d05c922a29d65fbe2a55b88b10319dc70719fde2d23e3c9840293fed6f1ab313
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2064 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 2036,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.238373827934265,
+      "epoch": 0.009823182711198428,
+      "grad_norm": 2.1393158435821533,
+      "learning_rate": 1.768172888015717e-06,
+      "loss": 0.5677,
+      "mean_token_accuracy": 0.7972675561904907,
+      "num_tokens": 13613.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.248021376132965,
+      "epoch": 0.019646365422396856,
+      "grad_norm": 2.1040091514587402,
+      "learning_rate": 3.732809430255403e-06,
+      "loss": 0.6091,
+      "mean_token_accuracy": 0.7844378292560578,
+      "num_tokens": 27766.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.2259408831596375,
+      "epoch": 0.029469548133595286,
+      "grad_norm": 2.8529012203216553,
+      "learning_rate": 5.697445972495088e-06,
+      "loss": 0.6106,
+      "mean_token_accuracy": 0.7926133036613464,
+      "num_tokens": 41975.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.241147220134735,
+      "epoch": 0.03929273084479371,
+      "grad_norm": 1.9179528951644897,
+      "learning_rate": 7.662082514734775e-06,
+      "loss": 0.5673,
+      "mean_token_accuracy": 0.7996828734874726,
+      "num_tokens": 55356.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.2450526833534241,
+      "epoch": 0.04911591355599214,
+      "grad_norm": 2.696981191635132,
+      "learning_rate": 9.62671905697446e-06,
+      "loss": 0.5686,
+      "mean_token_accuracy": 0.7941810250282287,
+      "num_tokens": 69627.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2605302929878235,
+      "epoch": 0.05893909626719057,
+      "grad_norm": 2.4317052364349365,
+      "learning_rate": 1.1591355599214145e-05,
+      "loss": 0.5802,
+      "mean_token_accuracy": 0.7867559194564819,
+      "num_tokens": 82999.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.2502994418144227,
+      "epoch": 0.068762278978389,
+      "grad_norm": 2.431112289428711,
+      "learning_rate": 1.3555992141453833e-05,
+      "loss": 0.5592,
+      "mean_token_accuracy": 0.7943781077861786,
+      "num_tokens": 96494.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.2417351365089417,
+      "epoch": 0.07858546168958742,
+      "grad_norm": 1.8804676532745361,
+      "learning_rate": 1.5520628683693518e-05,
+      "loss": 0.551,
+      "mean_token_accuracy": 0.7976289927959442,
+      "num_tokens": 110116.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.2354993939399719,
+      "epoch": 0.08840864440078586,
+      "grad_norm": 1.887425184249878,
+      "learning_rate": 1.7485265225933202e-05,
+      "loss": 0.4783,
+      "mean_token_accuracy": 0.830482566356659,
+      "num_tokens": 123915.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.2492876887321471,
+      "epoch": 0.09823182711198428,
+      "grad_norm": 1.531612515449524,
+      "learning_rate": 1.944990176817289e-05,
+      "loss": 0.5771,
+      "mean_token_accuracy": 0.7916022062301635,
+      "num_tokens": 137813.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.2218821048736572,
+      "epoch": 0.10805500982318271,
+      "grad_norm": 1.4735031127929688,
+      "learning_rate": 2.1414538310412574e-05,
+      "loss": 0.5318,
+      "mean_token_accuracy": 0.8113921225070954,
+      "num_tokens": 152025.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.2572803258895875,
+      "epoch": 0.11787819253438114,
+      "grad_norm": 1.5297958850860596,
+      "learning_rate": 2.3379174852652262e-05,
+      "loss": 0.6333,
+      "mean_token_accuracy": 0.7640059947967529,
+      "num_tokens": 165940.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.228113317489624,
+      "epoch": 0.12770137524557956,
+      "grad_norm": 1.594042181968689,
+      "learning_rate": 2.5343811394891947e-05,
+      "loss": 0.5409,
+      "mean_token_accuracy": 0.8003769814968109,
+      "num_tokens": 179385.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.202523648738861,
+      "epoch": 0.137524557956778,
+      "grad_norm": 1.2444961071014404,
+      "learning_rate": 2.730844793713163e-05,
+      "loss": 0.4893,
+      "mean_token_accuracy": 0.8203648805618287,
+      "num_tokens": 193383.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.2140995144844056,
+      "epoch": 0.14734774066797643,
+      "grad_norm": 1.4760793447494507,
+      "learning_rate": 2.9273084479371316e-05,
+      "loss": 0.5903,
+      "mean_token_accuracy": 0.7915676176548004,
+      "num_tokens": 206472.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.2337120175361633,
+      "epoch": 0.15717092337917485,
+      "grad_norm": 1.7599576711654663,
+      "learning_rate": 3.123772102161101e-05,
+      "loss": 0.5066,
+      "mean_token_accuracy": 0.8161738157272339,
+      "num_tokens": 220423.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.2380502581596375,
+      "epoch": 0.16699410609037327,
+      "grad_norm": 1.5037227869033813,
+      "learning_rate": 3.320235756385069e-05,
+      "loss": 0.5422,
+      "mean_token_accuracy": 0.8013253211975098,
+      "num_tokens": 235005.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.2622791171073913,
+      "epoch": 0.17681728880157171,
+      "grad_norm": 1.376826286315918,
+      "learning_rate": 3.5166994106090376e-05,
+      "loss": 0.581,
+      "mean_token_accuracy": 0.7882379591464996,
+      "num_tokens": 248939.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.2461886525154113,
+      "epoch": 0.18664047151277013,
+      "grad_norm": 1.4486498832702637,
+      "learning_rate": 3.713163064833006e-05,
+      "loss": 0.5537,
+      "mean_token_accuracy": 0.7965423583984375,
+      "num_tokens": 263177.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.242757785320282,
+      "epoch": 0.19646365422396855,
+      "grad_norm": 1.8996150493621826,
+      "learning_rate": 3.9096267190569745e-05,
+      "loss": 0.518,
+      "mean_token_accuracy": 0.8233550667762757,
+      "num_tokens": 276500.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.2447792530059814,
+      "epoch": 0.206286836935167,
+      "grad_norm": 1.6521553993225098,
+      "learning_rate": 4.106090373280943e-05,
+      "loss": 0.4893,
+      "mean_token_accuracy": 0.8305010080337525,
+      "num_tokens": 290500.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.2488795518875122,
+      "epoch": 0.21611001964636542,
+      "grad_norm": 1.757602572441101,
+      "learning_rate": 4.302554027504912e-05,
+      "loss": 0.4861,
+      "mean_token_accuracy": 0.82521493434906,
+      "num_tokens": 304231.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.2686235189437867,
+      "epoch": 0.22593320235756384,
+      "grad_norm": 1.4095959663391113,
+      "learning_rate": 4.4990176817288805e-05,
+      "loss": 0.555,
+      "mean_token_accuracy": 0.8019413590431214,
+      "num_tokens": 317976.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.2709406733512878,
+      "epoch": 0.2357563850687623,
+      "grad_norm": 1.935375452041626,
+      "learning_rate": 4.695481335952849e-05,
+      "loss": 0.5649,
+      "mean_token_accuracy": 0.8037905693054199,
+      "num_tokens": 331858.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.2544381499290467,
+      "epoch": 0.2455795677799607,
+      "grad_norm": 1.607476830482483,
+      "learning_rate": 4.8919449901768174e-05,
+      "loss": 0.5283,
+      "mean_token_accuracy": 0.8084118843078614,
+      "num_tokens": 345259.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.2540257692337036,
+      "epoch": 0.2554027504911591,
+      "grad_norm": 1.4414503574371338,
+      "learning_rate": 5.088408644400786e-05,
+      "loss": 0.5299,
+      "mean_token_accuracy": 0.8094106495380402,
+      "num_tokens": 360160.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.2452853798866272,
+      "epoch": 0.26522593320235754,
+      "grad_norm": 1.7544183731079102,
+      "learning_rate": 5.284872298624754e-05,
+      "loss": 0.494,
+      "mean_token_accuracy": 0.820741331577301,
+      "num_tokens": 373823.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.2502075552940368,
+      "epoch": 0.275049115913556,
+      "grad_norm": 1.8113288879394531,
+      "learning_rate": 5.481335952848723e-05,
+      "loss": 0.5062,
+      "mean_token_accuracy": 0.8143646121025085,
+      "num_tokens": 387923.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.2456924200057984,
+      "epoch": 0.28487229862475444,
+      "grad_norm": 1.7740198373794556,
+      "learning_rate": 5.677799607072691e-05,
+      "loss": 0.5139,
+      "mean_token_accuracy": 0.8153488993644714,
+      "num_tokens": 401453.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.2440819978713988,
+      "epoch": 0.29469548133595286,
+      "grad_norm": 1.0885217189788818,
+      "learning_rate": 5.874263261296661e-05,
+      "loss": 0.5045,
+      "mean_token_accuracy": 0.8190208375453949,
+      "num_tokens": 415468.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.2703365564346314,
+      "epoch": 0.3045186640471513,
+      "grad_norm": 1.416458249092102,
+      "learning_rate": 6.0707269155206295e-05,
+      "loss": 0.5524,
+      "mean_token_accuracy": 0.8009683132171631,
+      "num_tokens": 428767.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.2677528381347656,
+      "epoch": 0.3143418467583497,
+      "grad_norm": 1.9646638631820679,
+      "learning_rate": 6.267190569744598e-05,
+      "loss": 0.6255,
+      "mean_token_accuracy": 0.7640788197517395,
+      "num_tokens": 441931.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.2437112927436829,
+      "epoch": 0.3241650294695481,
+      "grad_norm": 1.4724621772766113,
+      "learning_rate": 6.463654223968566e-05,
+      "loss": 0.5284,
+      "mean_token_accuracy": 0.8117543816566467,
+      "num_tokens": 455302.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.26444011926651,
+      "epoch": 0.33398821218074654,
+      "grad_norm": 1.810052752494812,
+      "learning_rate": 6.660117878192535e-05,
+      "loss": 0.546,
+      "mean_token_accuracy": 0.808152836561203,
+      "num_tokens": 468961.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.2795380353927612,
+      "epoch": 0.343811394891945,
+      "grad_norm": 1.4199846982955933,
+      "learning_rate": 6.856581532416503e-05,
+      "loss": 0.5776,
+      "mean_token_accuracy": 0.7861001551151275,
+      "num_tokens": 482094.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.2831590175628662,
+      "epoch": 0.35363457760314343,
+      "grad_norm": 1.4240669012069702,
+      "learning_rate": 7.053045186640472e-05,
+      "loss": 0.5551,
+      "mean_token_accuracy": 0.7980610370635987,
+      "num_tokens": 495551.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.3066895961761475,
+      "epoch": 0.36345776031434185,
+      "grad_norm": 3.1153318881988525,
+      "learning_rate": 7.249508840864441e-05,
+      "loss": 0.6116,
+      "mean_token_accuracy": 0.7829572439193726,
+      "num_tokens": 509826.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.2638443112373352,
+      "epoch": 0.37328094302554027,
+      "grad_norm": 2.1263859272003174,
+      "learning_rate": 7.445972495088409e-05,
+      "loss": 0.5183,
+      "mean_token_accuracy": 0.8127178907394409,
+      "num_tokens": 523744.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.2958194017410278,
+      "epoch": 0.3831041257367387,
+      "grad_norm": 1.4014490842819214,
+      "learning_rate": 7.642436149312378e-05,
+      "loss": 0.6537,
+      "mean_token_accuracy": 0.7570026934146881,
+      "num_tokens": 537454.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.2576130390167237,
+      "epoch": 0.3929273084479371,
+      "grad_norm": 1.755161166191101,
+      "learning_rate": 7.838899803536346e-05,
+      "loss": 0.4932,
+      "mean_token_accuracy": 0.8207253098487854,
+      "num_tokens": 551546.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.2616795778274537,
+      "epoch": 0.4027504911591356,
+      "grad_norm": 1.5902996063232422,
+      "learning_rate": 8.035363457760315e-05,
+      "loss": 0.5245,
+      "mean_token_accuracy": 0.8081269204616547,
+      "num_tokens": 565707.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.2840699076652526,
+      "epoch": 0.412573673870334,
+      "grad_norm": 1.6743026971817017,
+      "learning_rate": 8.231827111984284e-05,
+      "loss": 0.5767,
+      "mean_token_accuracy": 0.7862762212753296,
+      "num_tokens": 579728.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.2563459753990174,
+      "epoch": 0.4223968565815324,
+      "grad_norm": 1.193405032157898,
+      "learning_rate": 8.428290766208252e-05,
+      "loss": 0.543,
+      "mean_token_accuracy": 0.8090866565704345,
+      "num_tokens": 593579.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.283545970916748,
+      "epoch": 0.43222003929273084,
+      "grad_norm": 1.5221855640411377,
+      "learning_rate": 8.62475442043222e-05,
+      "loss": 0.6117,
+      "mean_token_accuracy": 0.7837108314037323,
+      "num_tokens": 606791.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.2693499445915222,
+      "epoch": 0.44204322200392926,
+      "grad_norm": 1.6853163242340088,
+      "learning_rate": 8.821218074656188e-05,
+      "loss": 0.5621,
+      "mean_token_accuracy": 0.7976293742656708,
+      "num_tokens": 620602.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.2506368160247803,
+      "epoch": 0.4518664047151277,
+      "grad_norm": 1.6164072751998901,
+      "learning_rate": 9.017681728880158e-05,
+      "loss": 0.5205,
+      "mean_token_accuracy": 0.807697081565857,
+      "num_tokens": 634566.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.2814919590950011,
+      "epoch": 0.46168958742632615,
+      "grad_norm": 1.7173436880111694,
+      "learning_rate": 9.214145383104125e-05,
+      "loss": 0.5411,
+      "mean_token_accuracy": 0.8121235728263855,
+      "num_tokens": 648676.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.2675794124603272,
+      "epoch": 0.4715127701375246,
+      "grad_norm": 1.446212649345398,
+      "learning_rate": 9.410609037328096e-05,
+      "loss": 0.5563,
+      "mean_token_accuracy": 0.803589540719986,
+      "num_tokens": 662286.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.2913037061691284,
+      "epoch": 0.481335952848723,
+      "grad_norm": 1.630800485610962,
+      "learning_rate": 9.607072691552064e-05,
+      "loss": 0.6166,
+      "mean_token_accuracy": 0.7754902184009552,
+      "num_tokens": 675740.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.2715419769287108,
+      "epoch": 0.4911591355599214,
+      "grad_norm": 1.598403811454773,
+      "learning_rate": 9.803536345776033e-05,
+      "loss": 0.5614,
+      "mean_token_accuracy": 0.7951632618904114,
+      "num_tokens": 689865.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.285755467414856,
+      "epoch": 0.5009823182711198,
+      "grad_norm": 2.030689001083374,
+      "learning_rate": 0.0001,
+      "loss": 0.5434,
+      "mean_token_accuracy": 0.8062463700771332,
+      "num_tokens": 703743.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.3047456383705138,
+      "epoch": 0.5108055009823183,
+      "grad_norm": 1.8453326225280762,
+      "learning_rate": 9.999973618674915e-05,
+      "loss": 0.6121,
+      "mean_token_accuracy": 0.7798721611499786,
+      "num_tokens": 717512.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.282051682472229,
+      "epoch": 0.5206286836935167,
+      "grad_norm": 1.6759898662567139,
+      "learning_rate": 9.999894474978048e-05,
+      "loss": 0.5239,
+      "mean_token_accuracy": 0.8078398644924164,
+      "num_tokens": 731162.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.2989009261131286,
+      "epoch": 0.5304518664047151,
+      "grad_norm": 1.7401994466781616,
+      "learning_rate": 9.999762569744566e-05,
+      "loss": 0.5816,
+      "mean_token_accuracy": 0.7974128127098083,
+      "num_tokens": 745217.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.3011240839958191,
+      "epoch": 0.5402750491159135,
+      "grad_norm": 1.9052543640136719,
+      "learning_rate": 9.999577904366405e-05,
+      "loss": 0.555,
+      "mean_token_accuracy": 0.7961248874664306,
+      "num_tokens": 758976.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.3102270722389222,
+      "epoch": 0.550098231827112,
+      "grad_norm": 1.7880445718765259,
+      "learning_rate": 9.999340480792247e-05,
+      "loss": 0.6029,
+      "mean_token_accuracy": 0.7876855313777924,
+      "num_tokens": 772471.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.2655692934989928,
+      "epoch": 0.5599214145383105,
+      "grad_norm": 1.7137025594711304,
+      "learning_rate": 9.999050301527515e-05,
+      "loss": 0.5436,
+      "mean_token_accuracy": 0.800448739528656,
+      "num_tokens": 785857.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.2898759365081787,
+      "epoch": 0.5697445972495089,
+      "grad_norm": 2.03338360786438,
+      "learning_rate": 9.998707369634334e-05,
+      "loss": 0.5647,
+      "mean_token_accuracy": 0.7925122499465942,
+      "num_tokens": 799507.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.2878461837768556,
+      "epoch": 0.5795677799607073,
+      "grad_norm": 1.7258025407791138,
+      "learning_rate": 9.998311688731503e-05,
+      "loss": 0.5495,
+      "mean_token_accuracy": 0.7970447540283203,
+      "num_tokens": 813884.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.2761369585990905,
+      "epoch": 0.5893909626719057,
+      "grad_norm": 1.9462448358535767,
+      "learning_rate": 9.997863262994456e-05,
+      "loss": 0.5295,
+      "mean_token_accuracy": 0.8060545325279236,
+      "num_tokens": 827542.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.2756438493728637,
+      "epoch": 0.5992141453831041,
+      "grad_norm": 2.06569242477417,
+      "learning_rate": 9.99736209715522e-05,
+      "loss": 0.5747,
+      "mean_token_accuracy": 0.7955709218978881,
+      "num_tokens": 841676.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.2753918766975403,
+      "epoch": 0.6090373280943026,
+      "grad_norm": 1.7314369678497314,
+      "learning_rate": 9.996808196502362e-05,
+      "loss": 0.5151,
+      "mean_token_accuracy": 0.8180197477340698,
+      "num_tokens": 855269.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.2783099055290221,
+      "epoch": 0.618860510805501,
+      "grad_norm": 1.6164512634277344,
+      "learning_rate": 9.996201566880935e-05,
+      "loss": 0.4961,
+      "mean_token_accuracy": 0.8200631260871887,
+      "num_tokens": 868735.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.2850772857666015,
+      "epoch": 0.6286836935166994,
+      "grad_norm": 1.5462535619735718,
+      "learning_rate": 9.995542214692418e-05,
+      "loss": 0.5916,
+      "mean_token_accuracy": 0.7909732520580292,
+      "num_tokens": 882232.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.2697942018508912,
+      "epoch": 0.6385068762278978,
+      "grad_norm": 1.9398994445800781,
+      "learning_rate": 9.99483014689464e-05,
+      "loss": 0.5054,
+      "mean_token_accuracy": 0.8184501647949218,
+      "num_tokens": 895363.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.3003694057464599,
+      "epoch": 0.6483300589390962,
+      "grad_norm": 1.6913245916366577,
+      "learning_rate": 9.994065371001724e-05,
+      "loss": 0.5658,
+      "mean_token_accuracy": 0.7982197999954224,
+      "num_tokens": 909912.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.3075840830802918,
+      "epoch": 0.6581532416502947,
+      "grad_norm": 1.5393342971801758,
+      "learning_rate": 9.993247895083988e-05,
+      "loss": 0.574,
+      "mean_token_accuracy": 0.7920112848281861,
+      "num_tokens": 923818.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.2735092639923096,
+      "epoch": 0.6679764243614931,
+      "grad_norm": 1.6885005235671997,
+      "learning_rate": 9.99237772776787e-05,
+      "loss": 0.539,
+      "mean_token_accuracy": 0.7991899967193603,
+      "num_tokens": 937453.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.2649645924568176,
+      "epoch": 0.6777996070726916,
+      "grad_norm": 1.463413953781128,
+      "learning_rate": 9.991454878235837e-05,
+      "loss": 0.5361,
+      "mean_token_accuracy": 0.8108624756336212,
+      "num_tokens": 950998.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.2696751356124878,
+      "epoch": 0.68762278978389,
+      "grad_norm": 1.9994075298309326,
+      "learning_rate": 9.990479356226288e-05,
+      "loss": 0.5365,
+      "mean_token_accuracy": 0.8130120277404785,
+      "num_tokens": 964386.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.285075318813324,
+      "epoch": 0.6974459724950884,
+      "grad_norm": 1.7897218465805054,
+      "learning_rate": 9.989451172033447e-05,
+      "loss": 0.5871,
+      "mean_token_accuracy": 0.7820332407951355,
+      "num_tokens": 978060.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.2756176710128784,
+      "epoch": 0.7072691552062869,
+      "grad_norm": 1.7371011972427368,
+      "learning_rate": 9.98837033650726e-05,
+      "loss": 0.5597,
+      "mean_token_accuracy": 0.7937583506107331,
+      "num_tokens": 991672.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.2849397659301758,
+      "epoch": 0.7170923379174853,
+      "grad_norm": 1.641719937324524,
+      "learning_rate": 9.987236861053274e-05,
+      "loss": 0.5843,
+      "mean_token_accuracy": 0.7939905822277069,
+      "num_tokens": 1005457.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.2925720930099487,
+      "epoch": 0.7269155206286837,
+      "grad_norm": 1.8728162050247192,
+      "learning_rate": 9.986050757632525e-05,
+      "loss": 0.5755,
+      "mean_token_accuracy": 0.7945402979850769,
+      "num_tokens": 1019406.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.291877806186676,
+      "epoch": 0.7367387033398821,
+      "grad_norm": 1.6056290864944458,
+      "learning_rate": 9.984812038761405e-05,
+      "loss": 0.6116,
+      "mean_token_accuracy": 0.7776188969612121,
+      "num_tokens": 1032927.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.3065629363059998,
+      "epoch": 0.7465618860510805,
+      "grad_norm": 1.4939526319503784,
+      "learning_rate": 9.983520717511529e-05,
+      "loss": 0.6408,
+      "mean_token_accuracy": 0.7670970022678375,
+      "num_tokens": 1045833.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.282605803012848,
+      "epoch": 0.756385068762279,
+      "grad_norm": 1.7990894317626953,
+      "learning_rate": 9.982176807509607e-05,
+      "loss": 0.5696,
+      "mean_token_accuracy": 0.7870432496070862,
+      "num_tokens": 1059607.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.2711770296096803,
+      "epoch": 0.7662082514734774,
+      "grad_norm": 1.559313416481018,
+      "learning_rate": 9.980780322937287e-05,
+      "loss": 0.5315,
+      "mean_token_accuracy": 0.8101322710514068,
+      "num_tokens": 1073031.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.2708292603492737,
+      "epoch": 0.7760314341846758,
+      "grad_norm": 1.5710434913635254,
+      "learning_rate": 9.979331278531016e-05,
+      "loss": 0.5539,
+      "mean_token_accuracy": 0.8038661122322083,
+      "num_tokens": 1086355.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.3074462771415711,
+      "epoch": 0.7858546168958742,
+      "grad_norm": 1.5835829973220825,
+      "learning_rate": 9.977829689581877e-05,
+      "loss": 0.6236,
+      "mean_token_accuracy": 0.7792715787887573,
+      "num_tokens": 1100429.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.296637237071991,
+      "epoch": 0.7956777996070727,
+      "grad_norm": 1.571682095527649,
+      "learning_rate": 9.976275571935435e-05,
+      "loss": 0.5913,
+      "mean_token_accuracy": 0.7940649032592774,
+      "num_tokens": 1114663.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.294349157810211,
+      "epoch": 0.8055009823182712,
+      "grad_norm": 1.4109233617782593,
+      "learning_rate": 9.974668941991561e-05,
+      "loss": 0.6248,
+      "mean_token_accuracy": 0.7714365422725677,
+      "num_tokens": 1128403.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.319439172744751,
+      "epoch": 0.8153241650294696,
+      "grad_norm": 1.6622600555419922,
+      "learning_rate": 9.973009816704267e-05,
+      "loss": 0.6399,
+      "mean_token_accuracy": 0.774699580669403,
+      "num_tokens": 1142331.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.302856945991516,
+      "epoch": 0.825147347740668,
+      "grad_norm": 2.4136300086975098,
+      "learning_rate": 9.971298213581522e-05,
+      "loss": 0.5716,
+      "mean_token_accuracy": 0.7897345960140228,
+      "num_tokens": 1156242.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.287725281715393,
+      "epoch": 0.8349705304518664,
+      "grad_norm": 1.7344591617584229,
+      "learning_rate": 9.96953415068507e-05,
+      "loss": 0.5818,
+      "mean_token_accuracy": 0.7853143334388732,
+      "num_tokens": 1170390.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.258513343334198,
+      "epoch": 0.8447937131630648,
+      "grad_norm": 1.8267909288406372,
+      "learning_rate": 9.967717646630235e-05,
+      "loss": 0.5366,
+      "mean_token_accuracy": 0.8063280463218689,
+      "num_tokens": 1183788.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.2829570293426513,
+      "epoch": 0.8546168958742633,
+      "grad_norm": 1.7665373086929321,
+      "learning_rate": 9.965848720585734e-05,
+      "loss": 0.5489,
+      "mean_token_accuracy": 0.7993226885795593,
+      "num_tokens": 1197326.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.3128790736198426,
+      "epoch": 0.8644400785854617,
+      "grad_norm": 1.6902852058410645,
+      "learning_rate": 9.963927392273462e-05,
+      "loss": 0.6228,
+      "mean_token_accuracy": 0.7713834345340729,
+      "num_tokens": 1211392.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.3291242480278016,
+      "epoch": 0.8742632612966601,
+      "grad_norm": 2.342031240463257,
+      "learning_rate": 9.961953681968297e-05,
+      "loss": 0.6504,
+      "mean_token_accuracy": 0.7650188624858856,
+      "num_tokens": 1225560.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.3011715769767762,
+      "epoch": 0.8840864440078585,
+      "grad_norm": 2.45995831489563,
+      "learning_rate": 9.959927610497874e-05,
+      "loss": 0.617,
+      "mean_token_accuracy": 0.7712440609931945,
+      "num_tokens": 1239533.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.2857048749923705,
+      "epoch": 0.8939096267190569,
+      "grad_norm": 1.9025800228118896,
+      "learning_rate": 9.957849199242374e-05,
+      "loss": 0.5763,
+      "mean_token_accuracy": 0.787699168920517,
+      "num_tokens": 1253319.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.2860328674316406,
+      "epoch": 0.9037328094302554,
+      "grad_norm": 1.6897069215774536,
+      "learning_rate": 9.955718470134295e-05,
+      "loss": 0.5671,
+      "mean_token_accuracy": 0.8008992373943329,
+      "num_tokens": 1266631.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.2990724086761474,
+      "epoch": 0.9135559921414538,
+      "grad_norm": 1.771283745765686,
+      "learning_rate": 9.953535445658218e-05,
+      "loss": 0.6136,
+      "mean_token_accuracy": 0.7856487035751343,
+      "num_tokens": 1280141.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.2943416357040405,
+      "epoch": 0.9233791748526523,
+      "grad_norm": 2.1040866374969482,
+      "learning_rate": 9.951300148850576e-05,
+      "loss": 0.5738,
+      "mean_token_accuracy": 0.7893698453903198,
+      "num_tokens": 1294070.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.3020179510116576,
+      "epoch": 0.9332023575638507,
+      "grad_norm": 1.752038836479187,
+      "learning_rate": 9.949012603299404e-05,
+      "loss": 0.5919,
+      "mean_token_accuracy": 0.7839440703392029,
+      "num_tokens": 1308196.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.3016840934753418,
+      "epoch": 0.9430255402750491,
+      "grad_norm": 1.8453019857406616,
+      "learning_rate": 9.946672833144097e-05,
+      "loss": 0.5754,
+      "mean_token_accuracy": 0.7904948055744171,
+      "num_tokens": 1322261.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.2948450207710267,
+      "epoch": 0.9528487229862476,
+      "grad_norm": 1.617616891860962,
+      "learning_rate": 9.944280863075148e-05,
+      "loss": 0.5965,
+      "mean_token_accuracy": 0.7802974283695221,
+      "num_tokens": 1336225.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.302824819087982,
+      "epoch": 0.962671905697446,
+      "grad_norm": 1.8175745010375977,
+      "learning_rate": 9.941836718333894e-05,
+      "loss": 0.6292,
+      "mean_token_accuracy": 0.7750960767269135,
+      "num_tokens": 1349509.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.2981536149978639,
+      "epoch": 0.9724950884086444,
+      "grad_norm": 1.411348581314087,
+      "learning_rate": 9.939340424712247e-05,
+      "loss": 0.5127,
+      "mean_token_accuracy": 0.8160092115402222,
+      "num_tokens": 1363443.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.321254277229309,
+      "epoch": 0.9823182711198428,
+      "grad_norm": 1.8093916177749634,
+      "learning_rate": 9.936792008552418e-05,
+      "loss": 0.6142,
+      "mean_token_accuracy": 0.7801197230815887,
+      "num_tokens": 1377815.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.2920042157173157,
+      "epoch": 0.9921414538310412,
+      "grad_norm": 2.154522657394409,
+      "learning_rate": 9.934191496746647e-05,
+      "loss": 0.5433,
+      "mean_token_accuracy": 0.7936553716659546,
+      "num_tokens": 1391706.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.3085604310035706,
+      "epoch": 1.0019646365422397,
+      "grad_norm": 1.887580394744873,
+      "learning_rate": 9.931538916736911e-05,
+      "loss": 0.5834,
+      "mean_token_accuracy": 0.7798990666866302,
+      "num_tokens": 1405013.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.2716426730155945,
+      "epoch": 1.0117878192534382,
+      "grad_norm": 2.3578274250030518,
+      "learning_rate": 9.928834296514642e-05,
+      "loss": 0.4712,
+      "mean_token_accuracy": 0.8309662401676178,
+      "num_tokens": 1418903.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.3070968985557556,
+      "epoch": 1.0216110019646365,
+      "grad_norm": 1.740882396697998,
+      "learning_rate": 9.926077664620425e-05,
+      "loss": 0.5571,
+      "mean_token_accuracy": 0.8008412003517151,
+      "num_tokens": 1432163.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.275494432449341,
+      "epoch": 1.031434184675835,
+      "grad_norm": 1.8797022104263306,
+      "learning_rate": 9.923269050143702e-05,
+      "loss": 0.513,
+      "mean_token_accuracy": 0.8186901092529297,
+      "num_tokens": 1445781.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.2804364919662476,
+      "epoch": 1.0412573673870333,
+      "grad_norm": 2.1894519329071045,
+      "learning_rate": 9.920408482722461e-05,
+      "loss": 0.5092,
+      "mean_token_accuracy": 0.8133958995342254,
+      "num_tokens": 1460380.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.2710145592689515,
+      "epoch": 1.0510805500982319,
+      "grad_norm": 3.175863742828369,
+      "learning_rate": 9.917495992542925e-05,
+      "loss": 0.5161,
+      "mean_token_accuracy": 0.813822203874588,
+      "num_tokens": 1474285.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.2840298771858216,
+      "epoch": 1.0609037328094302,
+      "grad_norm": 2.1368906497955322,
+      "learning_rate": 9.914531610339235e-05,
+      "loss": 0.5541,
+      "mean_token_accuracy": 0.8022448778152466,
+      "num_tokens": 1487669.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.2852188110351563,
+      "epoch": 1.0707269155206287,
+      "grad_norm": 2.0913002490997314,
+      "learning_rate": 9.911515367393122e-05,
+      "loss": 0.5708,
+      "mean_token_accuracy": 0.8023091912269592,
+      "num_tokens": 1501959.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.2778987884521484,
+      "epoch": 1.080550098231827,
+      "grad_norm": 2.210040807723999,
+      "learning_rate": 9.908447295533583e-05,
+      "loss": 0.5464,
+      "mean_token_accuracy": 0.7970520079135894,
+      "num_tokens": 1516171.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.236346960067749,
+      "epoch": 1.0903732809430255,
+      "grad_norm": 1.4377529621124268,
+      "learning_rate": 9.905327427136535e-05,
+      "loss": 0.4655,
+      "mean_token_accuracy": 0.8350913822650909,
+      "num_tokens": 1529474.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.267558765411377,
+      "epoch": 1.1001964636542239,
+      "grad_norm": 2.254699945449829,
+      "learning_rate": 9.902155795124486e-05,
+      "loss": 0.5446,
+      "mean_token_accuracy": 0.8079259395599365,
+      "num_tokens": 1543309.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.2463451504707337,
+      "epoch": 1.1100196463654224,
+      "grad_norm": 1.5955703258514404,
+      "learning_rate": 9.898932432966174e-05,
+      "loss": 0.4809,
+      "mean_token_accuracy": 0.8256923139095307,
+      "num_tokens": 1557446.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.248048484325409,
+      "epoch": 1.119842829076621,
+      "grad_norm": 1.6384292840957642,
+      "learning_rate": 9.89565737467623e-05,
+      "loss": 0.4743,
+      "mean_token_accuracy": 0.8341399788856506,
+      "num_tokens": 1571146.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.2607948899269104,
+      "epoch": 1.1296660117878192,
+      "grad_norm": 1.8771889209747314,
+      "learning_rate": 9.892330654814803e-05,
+      "loss": 0.5011,
+      "mean_token_accuracy": 0.813750559091568,
+      "num_tokens": 1585417.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.233643364906311,
+      "epoch": 1.1394891944990178,
+      "grad_norm": 2.2473816871643066,
+      "learning_rate": 9.888952308487203e-05,
+      "loss": 0.4554,
+      "mean_token_accuracy": 0.8318858861923217,
+      "num_tokens": 1598975.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.238119614124298,
+      "epoch": 1.149312377210216,
+      "grad_norm": 2.5443625450134277,
+      "learning_rate": 9.885522371343532e-05,
+      "loss": 0.4779,
+      "mean_token_accuracy": 0.832828551530838,
+      "num_tokens": 1612637.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.2466851353645325,
+      "epoch": 1.1591355599214146,
+      "grad_norm": 2.026611804962158,
+      "learning_rate": 9.882040879578304e-05,
+      "loss": 0.5372,
+      "mean_token_accuracy": 0.805012685060501,
+      "num_tokens": 1626057.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.2370510578155518,
+      "epoch": 1.168958742632613,
+      "grad_norm": 1.9643908739089966,
+      "learning_rate": 9.878507869930067e-05,
+      "loss": 0.4996,
+      "mean_token_accuracy": 0.8205065846443176,
+      "num_tokens": 1639457.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.2587358474731445,
+      "epoch": 1.1787819253438114,
+      "grad_norm": 2.439802646636963,
+      "learning_rate": 9.874923379681009e-05,
+      "loss": 0.5101,
+      "mean_token_accuracy": 0.8154071569442749,
+      "num_tokens": 1653028.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.245667052268982,
+      "epoch": 1.1886051080550097,
+      "grad_norm": 1.7865586280822754,
+      "learning_rate": 9.871287446656574e-05,
+      "loss": 0.4969,
+      "mean_token_accuracy": 0.8192065060138702,
+      "num_tokens": 1666720.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.2510582208633423,
+      "epoch": 1.1984282907662083,
+      "grad_norm": 2.783156633377075,
+      "learning_rate": 9.867600109225052e-05,
+      "loss": 0.5276,
+      "mean_token_accuracy": 0.8083367943763733,
+      "num_tokens": 1680530.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.2682287335395812,
+      "epoch": 1.2082514734774068,
+      "grad_norm": 2.4126596450805664,
+      "learning_rate": 9.863861406297186e-05,
+      "loss": 0.5983,
+      "mean_token_accuracy": 0.7770793735980988,
+      "num_tokens": 1694339.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.2660762786865234,
+      "epoch": 1.218074656188605,
+      "grad_norm": 1.9345526695251465,
+      "learning_rate": 9.860071377325744e-05,
+      "loss": 0.4965,
+      "mean_token_accuracy": 0.8184070110321044,
+      "num_tokens": 1708197.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.255600094795227,
+      "epoch": 1.2278978388998034,
+      "grad_norm": 2.6998202800750732,
+      "learning_rate": 9.856230062305127e-05,
+      "loss": 0.5139,
+      "mean_token_accuracy": 0.8155104756355286,
+      "num_tokens": 1722214.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.225659394264221,
+      "epoch": 1.237721021611002,
+      "grad_norm": 1.7819828987121582,
+      "learning_rate": 9.852337501770923e-05,
+      "loss": 0.4564,
+      "mean_token_accuracy": 0.8367070078849792,
+      "num_tokens": 1736088.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.2524644374847411,
+      "epoch": 1.2475442043222005,
+      "grad_norm": 1.3680812120437622,
+      "learning_rate": 9.848393736799496e-05,
+      "loss": 0.5215,
+      "mean_token_accuracy": 0.808636051416397,
+      "num_tokens": 1749496.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.257518184185028,
+      "epoch": 1.2573673870333988,
+      "grad_norm": 2.3655996322631836,
+      "learning_rate": 9.844398809007545e-05,
+      "loss": 0.5431,
+      "mean_token_accuracy": 0.7984029889106751,
+      "num_tokens": 1763423.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.2404034614562989,
+      "epoch": 1.2671905697445973,
+      "grad_norm": 2.355236053466797,
+      "learning_rate": 9.840352760551663e-05,
+      "loss": 0.5024,
+      "mean_token_accuracy": 0.8227346181869507,
+      "num_tokens": 1777421.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.2580087780952454,
+      "epoch": 1.2770137524557956,
+      "grad_norm": 2.0372347831726074,
+      "learning_rate": 9.836255634127899e-05,
+      "loss": 0.4846,
+      "mean_token_accuracy": 0.8213749349117279,
+      "num_tokens": 1792763.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.2565680623054505,
+      "epoch": 1.2868369351669942,
+      "grad_norm": 2.134438991546631,
+      "learning_rate": 9.832107472971304e-05,
+      "loss": 0.5592,
+      "mean_token_accuracy": 0.7969561636447906,
+      "num_tokens": 1806941.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.246485674381256,
+      "epoch": 1.2966601178781925,
+      "grad_norm": 2.4246816635131836,
+      "learning_rate": 9.82790832085547e-05,
+      "loss": 0.4951,
+      "mean_token_accuracy": 0.82363041639328,
+      "num_tokens": 1820836.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.2375126361846924,
+      "epoch": 1.306483300589391,
+      "grad_norm": 2.738617420196533,
+      "learning_rate": 9.823658222092081e-05,
+      "loss": 0.5006,
+      "mean_token_accuracy": 0.8225988149642944,
+      "num_tokens": 1833250.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.24787095785141,
+      "epoch": 1.3163064833005893,
+      "grad_norm": 1.8391460180282593,
+      "learning_rate": 9.819357221530425e-05,
+      "loss": 0.4972,
+      "mean_token_accuracy": 0.8270265519618988,
+      "num_tokens": 1847343.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.2579961180686952,
+      "epoch": 1.3261296660117878,
+      "grad_norm": 1.4534238576889038,
+      "learning_rate": 9.815005364556946e-05,
+      "loss": 0.4954,
+      "mean_token_accuracy": 0.8219050347805024,
+      "num_tokens": 1861600.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.2523591637611389,
+      "epoch": 1.3359528487229864,
+      "grad_norm": 2.286081075668335,
+      "learning_rate": 9.810602697094742e-05,
+      "loss": 0.528,
+      "mean_token_accuracy": 0.8146000444889069,
+      "num_tokens": 1875384.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.263988447189331,
+      "epoch": 1.3457760314341847,
+      "grad_norm": 1.7268867492675781,
+      "learning_rate": 9.806149265603096e-05,
+      "loss": 0.496,
+      "mean_token_accuracy": 0.8314105927944183,
+      "num_tokens": 1888118.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.2864573359489442,
+      "epoch": 1.355599214145383,
+      "grad_norm": 1.9473626613616943,
+      "learning_rate": 9.801645117076972e-05,
+      "loss": 0.6076,
+      "mean_token_accuracy": 0.7793005406856537,
+      "num_tokens": 1901478.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.2580334901809693,
+      "epoch": 1.3654223968565815,
+      "grad_norm": 2.5675950050354004,
+      "learning_rate": 9.797090299046539e-05,
+      "loss": 0.5201,
+      "mean_token_accuracy": 0.814705616235733,
+      "num_tokens": 1914649.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.2844634294509887,
+      "epoch": 1.37524557956778,
+      "grad_norm": 2.2183785438537598,
+      "learning_rate": 9.792484859576648e-05,
+      "loss": 0.5447,
+      "mean_token_accuracy": 0.8052274882793427,
+      "num_tokens": 1928074.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.2458331108093261,
+      "epoch": 1.3850687622789783,
+      "grad_norm": 1.9489785432815552,
+      "learning_rate": 9.787828847266339e-05,
+      "loss": 0.4804,
+      "mean_token_accuracy": 0.829998356103897,
+      "num_tokens": 1941149.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.2675795674324035,
+      "epoch": 1.3948919449901769,
+      "grad_norm": 2.2568233013153076,
+      "learning_rate": 9.783122311248319e-05,
+      "loss": 0.5462,
+      "mean_token_accuracy": 0.802941232919693,
+      "num_tokens": 1955165.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.2623451232910157,
+      "epoch": 1.4047151277013752,
+      "grad_norm": 1.9349223375320435,
+      "learning_rate": 9.778365301188454e-05,
+      "loss": 0.5319,
+      "mean_token_accuracy": 0.8103811144828796,
+      "num_tokens": 1968689.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.2589616417884826,
+      "epoch": 1.4145383104125737,
+      "grad_norm": 2.0692477226257324,
+      "learning_rate": 9.773557867285232e-05,
+      "loss": 0.5031,
+      "mean_token_accuracy": 0.8202072083950043,
+      "num_tokens": 1983147.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.2585302472114563,
+      "epoch": 1.424361493123772,
+      "grad_norm": 2.16162371635437,
+      "learning_rate": 9.768700060269247e-05,
+      "loss": 0.526,
+      "mean_token_accuracy": 0.8141628861427307,
+      "num_tokens": 1996929.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.2599245667457581,
+      "epoch": 1.4341846758349706,
+      "grad_norm": 2.063103437423706,
+      "learning_rate": 9.763791931402652e-05,
+      "loss": 0.4848,
+      "mean_token_accuracy": 0.8278682291507721,
+      "num_tokens": 2010715.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.266039752960205,
+      "epoch": 1.4440078585461689,
+      "grad_norm": 2.109372854232788,
+      "learning_rate": 9.758833532478624e-05,
+      "loss": 0.5816,
+      "mean_token_accuracy": 0.7883775234222412,
+      "num_tokens": 2024736.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.2680623173713683,
+      "epoch": 1.4538310412573674,
+      "grad_norm": 1.812806487083435,
+      "learning_rate": 9.753824915820814e-05,
+      "loss": 0.5118,
+      "mean_token_accuracy": 0.8144886434078217,
+      "num_tokens": 2038954.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.2589596390724183,
+      "epoch": 1.463654223968566,
+      "grad_norm": 1.6400179862976074,
+      "learning_rate": 9.748766134282807e-05,
+      "loss": 0.5195,
+      "mean_token_accuracy": 0.807390421628952,
+      "num_tokens": 2052331.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.244090747833252,
+      "epoch": 1.4734774066797642,
+      "grad_norm": 2.0556066036224365,
+      "learning_rate": 9.743657241247542e-05,
+      "loss": 0.4997,
+      "mean_token_accuracy": 0.8174088597297668,
+      "num_tokens": 2066530.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.250917875766754,
+      "epoch": 1.4833005893909625,
+      "grad_norm": 2.830498218536377,
+      "learning_rate": 9.738498290626764e-05,
+      "loss": 0.4585,
+      "mean_token_accuracy": 0.8412882685661316,
+      "num_tokens": 2079919.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.2326926350593568,
+      "epoch": 1.493123772102161,
+      "grad_norm": 1.8350157737731934,
+      "learning_rate": 9.733289336860458e-05,
+      "loss": 0.5094,
+      "mean_token_accuracy": 0.8157238185405731,
+      "num_tokens": 2094074.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.2600191235542297,
+      "epoch": 1.5029469548133596,
+      "grad_norm": 1.993652105331421,
+      "learning_rate": 9.728030434916266e-05,
+      "loss": 0.5491,
+      "mean_token_accuracy": 0.8047675967216492,
+      "num_tokens": 2107458.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.259787654876709,
+      "epoch": 1.512770137524558,
+      "grad_norm": 2.0996932983398438,
+      "learning_rate": 9.722721640288905e-05,
+      "loss": 0.4725,
+      "mean_token_accuracy": 0.8254640996456146,
+      "num_tokens": 2121435.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.2564135551452638,
+      "epoch": 1.5225933202357562,
+      "grad_norm": 2.216902494430542,
+      "learning_rate": 9.717363008999594e-05,
+      "loss": 0.6043,
+      "mean_token_accuracy": 0.7789163291454315,
+      "num_tokens": 2134901.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.2386685490608216,
+      "epoch": 1.5324165029469548,
+      "grad_norm": 1.8616433143615723,
+      "learning_rate": 9.711954597595446e-05,
+      "loss": 0.4516,
+      "mean_token_accuracy": 0.8284256815910339,
+      "num_tokens": 2149157.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.2334220647811889,
+      "epoch": 1.5422396856581533,
+      "grad_norm": 2.008178234100342,
+      "learning_rate": 9.706496463148888e-05,
+      "loss": 0.484,
+      "mean_token_accuracy": 0.82932368516922,
+      "num_tokens": 2162991.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.2665512442588807,
+      "epoch": 1.5520628683693518,
+      "grad_norm": 2.1565146446228027,
+      "learning_rate": 9.700988663257047e-05,
+      "loss": 0.5353,
+      "mean_token_accuracy": 0.8073971390724182,
+      "num_tokens": 2177097.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.2624645590782166,
+      "epoch": 1.5618860510805501,
+      "grad_norm": 2.7686235904693604,
+      "learning_rate": 9.695431256041147e-05,
+      "loss": 0.5696,
+      "mean_token_accuracy": 0.7965208232402802,
+      "num_tokens": 2190589.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.254404366016388,
+      "epoch": 1.5717092337917484,
+      "grad_norm": 1.9496175050735474,
+      "learning_rate": 9.689824300145893e-05,
+      "loss": 0.5218,
+      "mean_token_accuracy": 0.8181787610054017,
+      "num_tokens": 2204358.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.2377846360206604,
+      "epoch": 1.581532416502947,
+      "grad_norm": 2.1385841369628906,
+      "learning_rate": 9.684167854738857e-05,
+      "loss": 0.4952,
+      "mean_token_accuracy": 0.8231679797172546,
+      "num_tokens": 2218080.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.2657816886901856,
+      "epoch": 1.5913555992141455,
+      "grad_norm": 1.9250800609588623,
+      "learning_rate": 9.678461979509849e-05,
+      "loss": 0.5684,
+      "mean_token_accuracy": 0.796778404712677,
+      "num_tokens": 2231436.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.2508048176765443,
+      "epoch": 1.6011787819253438,
+      "grad_norm": 2.177400827407837,
+      "learning_rate": 9.672706734670289e-05,
+      "loss": 0.4848,
+      "mean_token_accuracy": 0.8291748940944672,
+      "num_tokens": 2245613.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.268570113182068,
+      "epoch": 1.611001964636542,
+      "grad_norm": 2.31485652923584,
+      "learning_rate": 9.66690218095257e-05,
+      "loss": 0.5673,
+      "mean_token_accuracy": 0.7923191308975219,
+      "num_tokens": 2259056.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.2469953656196595,
+      "epoch": 1.6208251473477406,
+      "grad_norm": 2.7683792114257812,
+      "learning_rate": 9.661048379609418e-05,
+      "loss": 0.4902,
+      "mean_token_accuracy": 0.8232687532901763,
+      "num_tokens": 2272855.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.2610564589500428,
+      "epoch": 1.6306483300589392,
+      "grad_norm": 2.1523454189300537,
+      "learning_rate": 9.655145392413251e-05,
+      "loss": 0.5459,
+      "mean_token_accuracy": 0.800895380973816,
+      "num_tokens": 2286678.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.2304104089736938,
+      "epoch": 1.6404715127701375,
+      "grad_norm": 2.4282095432281494,
+      "learning_rate": 9.649193281655518e-05,
+      "loss": 0.4966,
+      "mean_token_accuracy": 0.8258331537246704,
+      "num_tokens": 2299799.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.2416871786117554,
+      "epoch": 1.650294695481336,
+      "grad_norm": 2.069984197616577,
+      "learning_rate": 9.643192110146044e-05,
+      "loss": 0.4981,
+      "mean_token_accuracy": 0.8195957362651825,
+      "num_tokens": 2314237.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.2401619911193849,
+      "epoch": 1.6601178781925343,
+      "grad_norm": 2.3762688636779785,
+      "learning_rate": 9.637141941212374e-05,
+      "loss": 0.5294,
+      "mean_token_accuracy": 0.8102250933647156,
+      "num_tokens": 2327772.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.2495620012283326,
+      "epoch": 1.6699410609037328,
+      "grad_norm": 2.802684783935547,
+      "learning_rate": 9.631042838699096e-05,
+      "loss": 0.5473,
+      "mean_token_accuracy": 0.8017067730426788,
+      "num_tokens": 2341503.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.2424279928207398,
+      "epoch": 1.6797642436149314,
+      "grad_norm": 1.5717180967330933,
+      "learning_rate": 9.624894866967174e-05,
+      "loss": 0.5288,
+      "mean_token_accuracy": 0.8105488240718841,
+      "num_tokens": 2355368.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.241310751438141,
+      "epoch": 1.6895874263261297,
+      "grad_norm": 2.1303505897521973,
+      "learning_rate": 9.618698090893263e-05,
+      "loss": 0.5075,
+      "mean_token_accuracy": 0.8153672814369202,
+      "num_tokens": 2369658.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.2365522384643555,
+      "epoch": 1.699410609037328,
+      "grad_norm": 2.0560405254364014,
+      "learning_rate": 9.612452575869028e-05,
+      "loss": 0.5393,
+      "mean_token_accuracy": 0.7969626665115357,
+      "num_tokens": 2382990.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.2362038850784303,
+      "epoch": 1.7092337917485265,
+      "grad_norm": 1.9128539562225342,
+      "learning_rate": 9.606158387800454e-05,
+      "loss": 0.5047,
+      "mean_token_accuracy": 0.8211510121822357,
+      "num_tokens": 2396917.0,
+      "step": 1740
+    },
+    {
+      "entropy": 1.2453686594963074,
+      "epoch": 1.719056974459725,
+      "grad_norm": 3.0778567790985107,
+      "learning_rate": 9.599815593107153e-05,
+      "loss": 0.5262,
+      "mean_token_accuracy": 0.8054801046848297,
+      "num_tokens": 2410056.0,
+      "step": 1750
+    },
+    {
+      "entropy": 1.241734540462494,
+      "epoch": 1.7288801571709234,
+      "grad_norm": 2.05613112449646,
+      "learning_rate": 9.593424258721653e-05,
+      "loss": 0.4914,
+      "mean_token_accuracy": 0.8227891206741333,
+      "num_tokens": 2423604.0,
+      "step": 1760
+    },
+    {
+      "entropy": 1.2589903235435487,
+      "epoch": 1.7387033398821217,
+      "grad_norm": 2.131194829940796,
+      "learning_rate": 9.586984452088703e-05,
+      "loss": 0.5733,
+      "mean_token_accuracy": 0.7925102472305298,
+      "num_tokens": 2437290.0,
+      "step": 1770
+    },
+    {
+      "entropy": 1.2547675371170044,
+      "epoch": 1.7485265225933202,
+      "grad_norm": 2.7211906909942627,
+      "learning_rate": 9.580496241164556e-05,
+      "loss": 0.5272,
+      "mean_token_accuracy": 0.8076090633869171,
+      "num_tokens": 2450587.0,
+      "step": 1780
+    },
+    {
+      "entropy": 1.2483873605728149,
+      "epoch": 1.7583497053045187,
+      "grad_norm": 2.1691336631774902,
+      "learning_rate": 9.573959694416253e-05,
+      "loss": 0.4981,
+      "mean_token_accuracy": 0.8250246226787568,
+      "num_tokens": 2464803.0,
+      "step": 1790
+    },
+    {
+      "entropy": 1.2557496905326844,
+      "epoch": 1.768172888015717,
+      "grad_norm": 2.1679351329803467,
+      "learning_rate": 9.567374880820898e-05,
+      "loss": 0.5053,
+      "mean_token_accuracy": 0.8208329737186432,
+      "num_tokens": 2478469.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.2435495257377625,
+      "epoch": 1.7779960707269156,
+      "grad_norm": 2.2118451595306396,
+      "learning_rate": 9.560741869864938e-05,
+      "loss": 0.4678,
+      "mean_token_accuracy": 0.8377247154712677,
+      "num_tokens": 2492183.0,
+      "step": 1810
+    },
+    {
+      "entropy": 1.227329707145691,
+      "epoch": 1.7878192534381139,
+      "grad_norm": 2.1643288135528564,
+      "learning_rate": 9.554060731543415e-05,
+      "loss": 0.4651,
+      "mean_token_accuracy": 0.8346364259719848,
+      "num_tokens": 2505870.0,
+      "step": 1820
+    },
+    {
+      "entropy": 1.2532490849494935,
+      "epoch": 1.7976424361493124,
+      "grad_norm": 1.8303252458572388,
+      "learning_rate": 9.547331536359247e-05,
+      "loss": 0.5014,
+      "mean_token_accuracy": 0.8197422027587891,
+      "num_tokens": 2520024.0,
+      "step": 1830
+    },
+    {
+      "entropy": 1.271022343635559,
+      "epoch": 1.807465618860511,
+      "grad_norm": 2.9909262657165527,
+      "learning_rate": 9.540554355322466e-05,
+      "loss": 0.4949,
+      "mean_token_accuracy": 0.8201545178890228,
+      "num_tokens": 2533689.0,
+      "step": 1840
+    },
+    {
+      "entropy": 1.2634048223495484,
+      "epoch": 1.8172888015717092,
+      "grad_norm": 1.7734454870224,
+      "learning_rate": 9.533729259949478e-05,
+      "loss": 0.5399,
+      "mean_token_accuracy": 0.8053711652755737,
+      "num_tokens": 2547528.0,
+      "step": 1850
+    },
+    {
+      "entropy": 1.290783977508545,
+      "epoch": 1.8271119842829076,
+      "grad_norm": 2.2786009311676025,
+      "learning_rate": 9.526856322262308e-05,
+      "loss": 0.5792,
+      "mean_token_accuracy": 0.7914485156536102,
+      "num_tokens": 2561664.0,
+      "step": 1860
+    },
+    {
+      "entropy": 1.2498828649520874,
+      "epoch": 1.836935166994106,
+      "grad_norm": 2.993764638900757,
+      "learning_rate": 9.519935614787837e-05,
+      "loss": 0.5043,
+      "mean_token_accuracy": 0.8198988556861877,
+      "num_tokens": 2575398.0,
+      "step": 1870
+    },
+    {
+      "entropy": 1.2753949642181397,
+      "epoch": 1.8467583497053046,
+      "grad_norm": 2.466764211654663,
+      "learning_rate": 9.512967210557038e-05,
+      "loss": 0.5359,
+      "mean_token_accuracy": 0.8064365327358246,
+      "num_tokens": 2589001.0,
+      "step": 1880
+    },
+    {
+      "entropy": 1.2798161029815673,
+      "epoch": 1.856581532416503,
+      "grad_norm": 1.6539644002914429,
+      "learning_rate": 9.505951183104207e-05,
+      "loss": 0.5409,
+      "mean_token_accuracy": 0.8036971032619477,
+      "num_tokens": 2602056.0,
+      "step": 1890
+    },
+    {
+      "entropy": 1.247439169883728,
+      "epoch": 1.8664047151277012,
+      "grad_norm": 2.616790533065796,
+      "learning_rate": 9.498887606466182e-05,
+      "loss": 0.4817,
+      "mean_token_accuracy": 0.8258330225944519,
+      "num_tokens": 2616334.0,
+      "step": 1900
+    },
+    {
+      "entropy": 1.2620904803276063,
+      "epoch": 1.8762278978388998,
+      "grad_norm": 2.2307326793670654,
+      "learning_rate": 9.49177655518157e-05,
+      "loss": 0.5128,
+      "mean_token_accuracy": 0.8172138810157776,
+      "num_tokens": 2630707.0,
+      "step": 1910
+    },
+    {
+      "entropy": 1.280482542514801,
+      "epoch": 1.8860510805500983,
+      "grad_norm": 2.6325061321258545,
+      "learning_rate": 9.484618104289952e-05,
+      "loss": 0.5774,
+      "mean_token_accuracy": 0.7905748307704925,
+      "num_tokens": 2644746.0,
+      "step": 1920
+    },
+    {
+      "entropy": 1.2566339731216432,
+      "epoch": 1.8958742632612968,
+      "grad_norm": 2.277913808822632,
+      "learning_rate": 9.4774123293311e-05,
+      "loss": 0.5013,
+      "mean_token_accuracy": 0.8227693378925324,
+      "num_tokens": 2658986.0,
+      "step": 1930
+    },
+    {
+      "entropy": 1.2552205681800843,
+      "epoch": 1.9056974459724951,
+      "grad_norm": 3.0060858726501465,
+      "learning_rate": 9.470159306344165e-05,
+      "loss": 0.511,
+      "mean_token_accuracy": 0.817187237739563,
+      "num_tokens": 2672385.0,
+      "step": 1940
+    },
+    {
+      "entropy": 1.2724160313606263,
+      "epoch": 1.9155206286836934,
+      "grad_norm": 2.1056430339813232,
+      "learning_rate": 9.462859111866891e-05,
+      "loss": 0.5097,
+      "mean_token_accuracy": 0.8144063234329224,
+      "num_tokens": 2686315.0,
+      "step": 1950
+    },
+    {
+      "entropy": 1.2793522596359252,
+      "epoch": 1.925343811394892,
+      "grad_norm": 2.075453519821167,
+      "learning_rate": 9.455511822934802e-05,
+      "loss": 0.5414,
+      "mean_token_accuracy": 0.8034018754959107,
+      "num_tokens": 2700223.0,
+      "step": 1960
+    },
+    {
+      "entropy": 1.2614792227745055,
+      "epoch": 1.9351669941060905,
+      "grad_norm": 2.2977967262268066,
+      "learning_rate": 9.448117517080383e-05,
+      "loss": 0.4827,
+      "mean_token_accuracy": 0.8254502475261688,
+      "num_tokens": 2714485.0,
+      "step": 1970
+    },
+    {
+      "entropy": 1.2860048413276672,
+      "epoch": 1.9449901768172888,
+      "grad_norm": 1.679022192955017,
+      "learning_rate": 9.44067627233227e-05,
+      "loss": 0.5495,
+      "mean_token_accuracy": 0.8007942914962769,
+      "num_tokens": 2727978.0,
+      "step": 1980
+    },
+    {
+      "entropy": 1.2813897252082824,
+      "epoch": 1.9548133595284871,
+      "grad_norm": 2.2625648975372314,
+      "learning_rate": 9.433188167214419e-05,
+      "loss": 0.5245,
+      "mean_token_accuracy": 0.8010579526424408,
+      "num_tokens": 2742267.0,
+      "step": 1990
+    },
+    {
+      "entropy": 1.2723148226737977,
+      "epoch": 1.9646365422396856,
+      "grad_norm": 1.6450095176696777,
+      "learning_rate": 9.425653280745289e-05,
+      "loss": 0.5382,
+      "mean_token_accuracy": 0.8019434869289398,
+      "num_tokens": 2755781.0,
+      "step": 2000
+    },
+    {
+      "entropy": 1.252918040752411,
+      "epoch": 1.9744597249508842,
+      "grad_norm": 2.664660692214966,
+      "learning_rate": 9.418071692436991e-05,
+      "loss": 0.4686,
+      "mean_token_accuracy": 0.834530645608902,
+      "num_tokens": 2769364.0,
+      "step": 2010
+    },
+    {
+      "entropy": 1.2819023251533508,
+      "epoch": 1.9842829076620825,
+      "grad_norm": 2.215744972229004,
+      "learning_rate": 9.410443482294468e-05,
+      "loss": 0.5552,
+      "mean_token_accuracy": 0.802883917093277,
+      "num_tokens": 2782778.0,
+      "step": 2020
+    },
+    {
+      "entropy": 1.2668519973754884,
+      "epoch": 1.9941060903732808,
+      "grad_norm": 1.973852276802063,
+      "learning_rate": 9.402768730814632e-05,
+      "loss": 0.4683,
+      "mean_token_accuracy": 0.8320684492588043,
+      "num_tokens": 2796370.0,
+      "step": 2030
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10180,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1930547557440512e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca00661b4201b9c900ba613719f42e2216580f8bd1d0e3994cb00560554804cf
+size 6481

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-2036/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4874ce4daecc23f55bfbb117c7ac489e347cad968b9e42a2044a48a6b4dac404
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-3054/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896