agu18dec committed on
Commit
e968c5f
·
verified ·
1 Parent(s): c24b69b

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/README.md +61 -0
  3. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_config.json +48 -0
  4. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_model.safetensors +3 -0
  5. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/added_tokens.json +24 -0
  6. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/chat_template.jinja +54 -0
  7. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/README.md +209 -0
  8. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_config.json +48 -0
  9. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_model.safetensors +3 -0
  10. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/added_tokens.json +24 -0
  11. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/chat_template.jinja +54 -0
  12. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/merges.txt +0 -0
  13. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/special_tokens_map.json +31 -0
  14. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer.json +3 -0
  15. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer_config.json +207 -0
  16. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/trainer_state.json +1874 -0
  17. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/training_args.bin +3 -0
  18. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/vocab.json +0 -0
  19. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/README.md +209 -0
  20. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_config.json +48 -0
  21. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_model.safetensors +3 -0
  22. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/added_tokens.json +24 -0
  23. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/chat_template.jinja +54 -0
  24. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/merges.txt +0 -0
  25. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/special_tokens_map.json +31 -0
  26. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer.json +3 -0
  27. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer_config.json +207 -0
  28. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/trainer_state.json +2794 -0
  29. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/training_args.bin +3 -0
  30. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/vocab.json +0 -0
  31. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/README.md +209 -0
  32. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_config.json +48 -0
  33. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_model.safetensors +3 -0
  34. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/added_tokens.json +24 -0
  35. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/chat_template.jinja +54 -0
  36. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/merges.txt +0 -0
  37. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/special_tokens_map.json +31 -0
  38. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer.json +3 -0
  39. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer_config.json +207 -0
  40. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/trainer_state.json +0 -0
  41. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/training_args.bin +3 -0
  42. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/vocab.json +0 -0
  43. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/README.md +209 -0
  44. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_config.json +48 -0
  45. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_model.safetensors +3 -0
  46. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/added_tokens.json +24 -0
  47. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/chat_template.jinja +54 -0
  48. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/merges.txt +0 -0
  49. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/special_tokens_map.json +31 -0
  50. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/tokenizer.json +3 -0
.gitattributes CHANGED
@@ -320,3 +320,14 @@ checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_baseline/tokenizer.json f
320
  checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2241/tokenizer.json filter=lfs diff=lfs merge=lfs -text
321
  checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2490/tokenizer.json filter=lfs diff=lfs merge=lfs -text
322
  checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
320
  checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2241/tokenizer.json filter=lfs diff=lfs merge=lfs -text
321
  checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/checkpoint-2490/tokenizer.json filter=lfs diff=lfs merge=lfs -text
322
  checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L33_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
323
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer.json filter=lfs diff=lfs merge=lfs -text
324
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer.json filter=lfs diff=lfs merge=lfs -text
325
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer.json filter=lfs diff=lfs merge=lfs -text
326
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
327
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-5520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
328
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-6440/tokenizer.json filter=lfs diff=lfs merge=lfs -text
329
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-7360/tokenizer.json filter=lfs diff=lfs merge=lfs -text
330
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-8280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
331
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-920/tokenizer.json filter=lfs diff=lfs merge=lfs -text
332
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
333
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ licence: license
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys
16
+
17
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
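+ generator = pipeline("text-generation", model="checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys", device="cuda")  # path to this adapter directory within the repository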
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/4bc95c7e)
34
+
35
+
36
+ This model was trained with SFT.
37
+
38
+ ### Framework versions
39
+
40
+ - PEFT: 0.19.1
41
+ - TRL: 0.28.0
42
+ - Transformers: 4.57.6
43
+ - Pytorch: 2.9.1
44
+ - Datasets: 4.5.0
45
+ - Tokenizers: 0.22.2
46
+
47
+ ## Citations
48
+
49
+
50
+
51
+ Cite TRL as:
52
+
53
+ ```bibtex
54
+ @software{vonwerra2020trl,
55
+ title = {{TRL: Transformers Reinforcement Learning}},
56
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
57
+ license = {Apache-2.0},
58
+ url = {https://github.com/huggingface/trl},
59
+ year = {2020}
60
+ }
61
+ ```
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "v_proj",
37
+ "q_proj",
38
+ "o_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0523104214bf52665e06f3d4f2914a483131c9fc5d65947216d43af9c5a3c78
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys (checkpoint-1840)
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "v_proj",
37
+ "q_proj",
38
+ "o_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e7089da8d9f7f02bd579baee830e61c329d1ed80ef19e391eec66c111cc675e
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/trainer_state.json ADDED
@@ -0,0 +1,1874 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1840,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2980000615119933,
14
+ "epoch": 0.010869565217391304,
15
+ "grad_norm": 7.514286994934082,
16
+ "learning_rate": 1.956521739130435e-06,
17
+ "loss": 1.8548,
18
+ "mean_token_accuracy": 0.5365569293498993,
19
+ "num_tokens": 13273.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.3184159398078918,
24
+ "epoch": 0.021739130434782608,
25
+ "grad_norm": 6.582128524780273,
26
+ "learning_rate": 4.130434782608695e-06,
27
+ "loss": 1.9416,
28
+ "mean_token_accuracy": 0.5010036021471024,
29
+ "num_tokens": 26299.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.302778995037079,
34
+ "epoch": 0.03260869565217391,
35
+ "grad_norm": 6.661994457244873,
36
+ "learning_rate": 6.304347826086957e-06,
37
+ "loss": 1.7644,
38
+ "mean_token_accuracy": 0.5327741354703903,
39
+ "num_tokens": 39608.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.355496096611023,
44
+ "epoch": 0.043478260869565216,
45
+ "grad_norm": 2.829239845275879,
46
+ "learning_rate": 8.478260869565217e-06,
47
+ "loss": 1.5473,
48
+ "mean_token_accuracy": 0.5216561764478683,
49
+ "num_tokens": 52279.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.4233315467834473,
54
+ "epoch": 0.05434782608695652,
55
+ "grad_norm": 1.384964108467102,
56
+ "learning_rate": 1.0652173913043479e-05,
57
+ "loss": 1.3512,
58
+ "mean_token_accuracy": 0.5346131652593613,
59
+ "num_tokens": 65371.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.4783715605735779,
64
+ "epoch": 0.06521739130434782,
65
+ "grad_norm": 1.2184863090515137,
66
+ "learning_rate": 1.2826086956521741e-05,
67
+ "loss": 1.3353,
68
+ "mean_token_accuracy": 0.5265826016664505,
69
+ "num_tokens": 78549.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.4811665654182433,
74
+ "epoch": 0.07608695652173914,
75
+ "grad_norm": 0.8817082047462463,
76
+ "learning_rate": 1.5e-05,
77
+ "loss": 1.2885,
78
+ "mean_token_accuracy": 0.5369167566299439,
79
+ "num_tokens": 91168.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.485330879688263,
84
+ "epoch": 0.08695652173913043,
85
+ "grad_norm": 1.0375007390975952,
86
+ "learning_rate": 1.7173913043478263e-05,
87
+ "loss": 1.3207,
88
+ "mean_token_accuracy": 0.5182694345712662,
89
+ "num_tokens": 104210.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.4509355902671814,
94
+ "epoch": 0.09782608695652174,
95
+ "grad_norm": 0.866616427898407,
96
+ "learning_rate": 1.9347826086956523e-05,
97
+ "loss": 1.2442,
98
+ "mean_token_accuracy": 0.5508454263210296,
99
+ "num_tokens": 117342.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.4595998883247376,
104
+ "epoch": 0.10869565217391304,
105
+ "grad_norm": 0.9921526312828064,
106
+ "learning_rate": 2.1521739130434784e-05,
107
+ "loss": 1.2513,
108
+ "mean_token_accuracy": 0.5439675092697144,
109
+ "num_tokens": 130168.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.468259596824646,
114
+ "epoch": 0.11956521739130435,
115
+ "grad_norm": 0.8542688488960266,
116
+ "learning_rate": 2.3695652173913045e-05,
117
+ "loss": 1.2523,
118
+ "mean_token_accuracy": 0.5456153243780136,
119
+ "num_tokens": 143277.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.4652462244033813,
124
+ "epoch": 0.13043478260869565,
125
+ "grad_norm": 0.8958607316017151,
126
+ "learning_rate": 2.5869565217391305e-05,
127
+ "loss": 1.2564,
128
+ "mean_token_accuracy": 0.5374186933040619,
129
+ "num_tokens": 155929.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.4442671895027162,
134
+ "epoch": 0.14130434782608695,
135
+ "grad_norm": 1.0437828302383423,
136
+ "learning_rate": 2.8043478260869566e-05,
137
+ "loss": 1.2463,
138
+ "mean_token_accuracy": 0.5506911396980285,
139
+ "num_tokens": 168922.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.442794382572174,
144
+ "epoch": 0.15217391304347827,
145
+ "grad_norm": 1.1950273513793945,
146
+ "learning_rate": 3.0217391304347827e-05,
147
+ "loss": 1.2343,
148
+ "mean_token_accuracy": 0.561489287018776,
149
+ "num_tokens": 181883.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.4483441829681396,
154
+ "epoch": 0.16304347826086957,
155
+ "grad_norm": 1.27411687374115,
156
+ "learning_rate": 3.239130434782609e-05,
157
+ "loss": 1.2515,
158
+ "mean_token_accuracy": 0.5461658954620361,
159
+ "num_tokens": 194847.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.4531602740287781,
164
+ "epoch": 0.17391304347826086,
165
+ "grad_norm": 0.9844512343406677,
166
+ "learning_rate": 3.456521739130435e-05,
167
+ "loss": 1.2379,
168
+ "mean_token_accuracy": 0.5450588703155518,
169
+ "num_tokens": 207431.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.4460819005966186,
174
+ "epoch": 0.18478260869565216,
175
+ "grad_norm": 0.965182363986969,
176
+ "learning_rate": 3.673913043478261e-05,
177
+ "loss": 1.2497,
178
+ "mean_token_accuracy": 0.5442000389099121,
179
+ "num_tokens": 220382.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.4696384787559509,
184
+ "epoch": 0.1956521739130435,
185
+ "grad_norm": 0.8425037860870361,
186
+ "learning_rate": 3.8913043478260866e-05,
187
+ "loss": 1.2847,
188
+ "mean_token_accuracy": 0.5304420441389084,
189
+ "num_tokens": 232940.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.4491984724998475,
194
+ "epoch": 0.20652173913043478,
195
+ "grad_norm": 1.1692280769348145,
196
+ "learning_rate": 4.1086956521739134e-05,
197
+ "loss": 1.2342,
198
+ "mean_token_accuracy": 0.5570813834667205,
199
+ "num_tokens": 245747.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.466271436214447,
204
+ "epoch": 0.21739130434782608,
205
+ "grad_norm": 1.0157368183135986,
206
+ "learning_rate": 4.3260869565217394e-05,
207
+ "loss": 1.2499,
208
+ "mean_token_accuracy": 0.5432725459337234,
209
+ "num_tokens": 258696.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.4768565893173218,
214
+ "epoch": 0.22826086956521738,
215
+ "grad_norm": 1.109692096710205,
216
+ "learning_rate": 4.5434782608695655e-05,
217
+ "loss": 1.2343,
218
+ "mean_token_accuracy": 0.5567020237445831,
219
+ "num_tokens": 271378.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.4473167181015014,
224
+ "epoch": 0.2391304347826087,
225
+ "grad_norm": 0.850563108921051,
226
+ "learning_rate": 4.7608695652173916e-05,
227
+ "loss": 1.1959,
228
+ "mean_token_accuracy": 0.5724921762943268,
229
+ "num_tokens": 284704.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.4478083968162536,
234
+ "epoch": 0.25,
235
+ "grad_norm": 1.0289748907089233,
236
+ "learning_rate": 4.9782608695652176e-05,
237
+ "loss": 1.2392,
238
+ "mean_token_accuracy": 0.5519216269254684,
239
+ "num_tokens": 296961.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.4915278434753418,
244
+ "epoch": 0.2608695652173913,
245
+ "grad_norm": 1.3161778450012207,
246
+ "learning_rate": 5.195652173913044e-05,
247
+ "loss": 1.2539,
248
+ "mean_token_accuracy": 0.5443875581026077,
249
+ "num_tokens": 310082.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.4435262322425841,
254
+ "epoch": 0.2717391304347826,
255
+ "grad_norm": 1.2697113752365112,
256
+ "learning_rate": 5.41304347826087e-05,
257
+ "loss": 1.1911,
258
+ "mean_token_accuracy": 0.576522421836853,
259
+ "num_tokens": 323044.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.4607349276542663,
264
+ "epoch": 0.2826086956521739,
265
+ "grad_norm": 0.8006339073181152,
266
+ "learning_rate": 5.630434782608696e-05,
267
+ "loss": 1.2088,
268
+ "mean_token_accuracy": 0.5584357857704163,
269
+ "num_tokens": 336108.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.4630830883979797,
274
+ "epoch": 0.29347826086956524,
275
+ "grad_norm": 0.8462095856666565,
276
+ "learning_rate": 5.847826086956521e-05,
277
+ "loss": 1.2458,
278
+ "mean_token_accuracy": 0.5520103573799133,
279
+ "num_tokens": 349210.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.458599328994751,
284
+ "epoch": 0.30434782608695654,
285
+ "grad_norm": 0.930942177772522,
286
+ "learning_rate": 6.0652173913043487e-05,
287
+ "loss": 1.2219,
288
+ "mean_token_accuracy": 0.5603324949741364,
289
+ "num_tokens": 361465.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.4730467200279236,
294
+ "epoch": 0.31521739130434784,
295
+ "grad_norm": 0.9836443066596985,
296
+ "learning_rate": 6.282608695652175e-05,
297
+ "loss": 1.2493,
298
+ "mean_token_accuracy": 0.5466845005750656,
299
+ "num_tokens": 374931.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.4596371173858642,
304
+ "epoch": 0.32608695652173914,
305
+ "grad_norm": 0.9860939383506775,
306
+ "learning_rate": 6.500000000000001e-05,
307
+ "loss": 1.2174,
308
+ "mean_token_accuracy": 0.556469538807869,
309
+ "num_tokens": 387929.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.4784631490707398,
314
+ "epoch": 0.33695652173913043,
315
+ "grad_norm": 0.8261193037033081,
316
+ "learning_rate": 6.717391304347827e-05,
317
+ "loss": 1.2191,
318
+ "mean_token_accuracy": 0.5600455164909363,
319
+ "num_tokens": 401392.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.4627429723739624,
324
+ "epoch": 0.34782608695652173,
325
+ "grad_norm": 0.896903395652771,
326
+ "learning_rate": 6.934782608695653e-05,
327
+ "loss": 1.1987,
328
+ "mean_token_accuracy": 0.5688268154859543,
329
+ "num_tokens": 414466.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.481223452091217,
334
+ "epoch": 0.358695652173913,
335
+ "grad_norm": 0.9765130877494812,
336
+ "learning_rate": 7.152173913043479e-05,
337
+ "loss": 1.2161,
338
+ "mean_token_accuracy": 0.5661008894443512,
339
+ "num_tokens": 427231.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.5086342811584472,
344
+ "epoch": 0.3695652173913043,
345
+ "grad_norm": 0.8136937022209167,
346
+ "learning_rate": 7.369565217391304e-05,
347
+ "loss": 1.2884,
348
+ "mean_token_accuracy": 0.5307422339916229,
349
+ "num_tokens": 439856.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.4857903122901917,
354
+ "epoch": 0.3804347826086957,
355
+ "grad_norm": 0.913378894329071,
356
+ "learning_rate": 7.58695652173913e-05,
357
+ "loss": 1.2631,
358
+ "mean_token_accuracy": 0.5459983497858047,
359
+ "num_tokens": 452683.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.5006132960319518,
364
+ "epoch": 0.391304347826087,
365
+ "grad_norm": 1.0260237455368042,
366
+ "learning_rate": 7.804347826086957e-05,
367
+ "loss": 1.2587,
368
+ "mean_token_accuracy": 0.5429587304592133,
369
+ "num_tokens": 465274.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.4861261010169984,
374
+ "epoch": 0.40217391304347827,
375
+ "grad_norm": 1.04011869430542,
376
+ "learning_rate": 8.021739130434783e-05,
377
+ "loss": 1.2147,
378
+ "mean_token_accuracy": 0.5620492398738861,
379
+ "num_tokens": 478175.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.4943390011787414,
384
+ "epoch": 0.41304347826086957,
385
+ "grad_norm": 0.9155416488647461,
386
+ "learning_rate": 8.23913043478261e-05,
387
+ "loss": 1.2128,
388
+ "mean_token_accuracy": 0.5667012810707093,
389
+ "num_tokens": 491001.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.5116252064704896,
394
+ "epoch": 0.42391304347826086,
395
+ "grad_norm": 0.8238904476165771,
396
+ "learning_rate": 8.456521739130435e-05,
397
+ "loss": 1.2677,
398
+ "mean_token_accuracy": 0.5370148032903671,
399
+ "num_tokens": 503764.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.4961246132850647,
404
+ "epoch": 0.43478260869565216,
405
+ "grad_norm": 0.8830587863922119,
406
+ "learning_rate": 8.673913043478261e-05,
407
+ "loss": 1.1999,
408
+ "mean_token_accuracy": 0.5743164956569672,
409
+ "num_tokens": 516294.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.5065942287445069,
414
+ "epoch": 0.44565217391304346,
415
+ "grad_norm": 0.9117815494537354,
416
+ "learning_rate": 8.891304347826088e-05,
417
+ "loss": 1.2607,
418
+ "mean_token_accuracy": 0.550678727030754,
419
+ "num_tokens": 529384.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.5079344272613526,
424
+ "epoch": 0.45652173913043476,
425
+ "grad_norm": 0.8730387091636658,
426
+ "learning_rate": 9.108695652173914e-05,
427
+ "loss": 1.2087,
428
+ "mean_token_accuracy": 0.5660586059093475,
429
+ "num_tokens": 542010.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.5147196769714355,
434
+ "epoch": 0.4673913043478261,
435
+ "grad_norm": 0.7791972160339355,
436
+ "learning_rate": 9.32608695652174e-05,
437
+ "loss": 1.2471,
438
+ "mean_token_accuracy": 0.5513437986373901,
439
+ "num_tokens": 554428.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.5182705640792846,
444
+ "epoch": 0.4782608695652174,
445
+ "grad_norm": 0.7569729089736938,
446
+ "learning_rate": 9.543478260869566e-05,
447
+ "loss": 1.2876,
448
+ "mean_token_accuracy": 0.5325394898653031,
449
+ "num_tokens": 567462.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.5036618828773498,
454
+ "epoch": 0.4891304347826087,
455
+ "grad_norm": 0.7794932126998901,
456
+ "learning_rate": 9.760869565217392e-05,
457
+ "loss": 1.2539,
458
+ "mean_token_accuracy": 0.5439064025878906,
459
+ "num_tokens": 580377.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.4947790503501892,
464
+ "epoch": 0.5,
465
+ "grad_norm": 0.8008731007575989,
466
+ "learning_rate": 9.978260869565218e-05,
467
+ "loss": 1.2352,
468
+ "mean_token_accuracy": 0.5524563610553741,
469
+ "num_tokens": 593597.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.5091761827468873,
474
+ "epoch": 0.5108695652173914,
475
+ "grad_norm": 0.9790273904800415,
476
+ "learning_rate": 9.999973836157333e-05,
477
+ "loss": 1.2448,
478
+ "mean_token_accuracy": 0.5587224543094635,
479
+ "num_tokens": 606659.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.5102417111396789,
484
+ "epoch": 0.5217391304347826,
485
+ "grad_norm": 0.9725663065910339,
486
+ "learning_rate": 9.999883393595947e-05,
487
+ "loss": 1.2366,
488
+ "mean_token_accuracy": 0.555170550942421,
489
+ "num_tokens": 619406.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.5230854153633118,
494
+ "epoch": 0.532608695652174,
495
+ "grad_norm": 1.0150320529937744,
496
+ "learning_rate": 9.999728350473721e-05,
497
+ "loss": 1.2304,
498
+ "mean_token_accuracy": 0.5601270943880081,
499
+ "num_tokens": 632194.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.5028501629829407,
504
+ "epoch": 0.5434782608695652,
505
+ "grad_norm": 0.8656931519508362,
506
+ "learning_rate": 9.99950870879387e-05,
507
+ "loss": 1.2286,
508
+ "mean_token_accuracy": 0.5571267485618592,
509
+ "num_tokens": 645327.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.5302343845367432,
514
+ "epoch": 0.5543478260869565,
515
+ "grad_norm": 0.7537740468978882,
516
+ "learning_rate": 9.99922447139426e-05,
517
+ "loss": 1.2342,
518
+ "mean_token_accuracy": 0.5612987399101257,
519
+ "num_tokens": 658378.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.5025633692741394,
524
+ "epoch": 0.5652173913043478,
525
+ "grad_norm": 0.6478719115257263,
526
+ "learning_rate": 9.998875641947354e-05,
527
+ "loss": 1.2429,
528
+ "mean_token_accuracy": 0.5501718163490296,
529
+ "num_tokens": 671323.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.5004254579544067,
534
+ "epoch": 0.5760869565217391,
535
+ "grad_norm": 1.2102432250976562,
536
+ "learning_rate": 9.998462224960175e-05,
537
+ "loss": 1.213,
538
+ "mean_token_accuracy": 0.5621294498443603,
539
+ "num_tokens": 683878.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.5195318818092347,
544
+ "epoch": 0.5869565217391305,
545
+ "grad_norm": 0.7961319088935852,
546
+ "learning_rate": 9.997984225774238e-05,
547
+ "loss": 1.2492,
548
+ "mean_token_accuracy": 0.5559745967388153,
549
+ "num_tokens": 696935.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.5280181407928466,
554
+ "epoch": 0.5978260869565217,
555
+ "grad_norm": 0.8740176558494568,
556
+ "learning_rate": 9.99744165056549e-05,
557
+ "loss": 1.2197,
558
+ "mean_token_accuracy": 0.5634395360946656,
559
+ "num_tokens": 710020.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.5327624678611755,
564
+ "epoch": 0.6086956521739131,
565
+ "grad_norm": 0.8462045192718506,
566
+ "learning_rate": 9.99683450634423e-05,
567
+ "loss": 1.2192,
568
+ "mean_token_accuracy": 0.5612434148788452,
569
+ "num_tokens": 723303.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.5094027161598205,
574
+ "epoch": 0.6195652173913043,
575
+ "grad_norm": 0.9465392231941223,
576
+ "learning_rate": 9.996162800955011e-05,
577
+ "loss": 1.1817,
578
+ "mean_token_accuracy": 0.5782815992832184,
579
+ "num_tokens": 735527.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.5501951813697814,
584
+ "epoch": 0.6304347826086957,
585
+ "grad_norm": 0.7445736527442932,
586
+ "learning_rate": 9.995426543076545e-05,
587
+ "loss": 1.2452,
588
+ "mean_token_accuracy": 0.5505437403917313,
589
+ "num_tokens": 748455.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.5227739214897156,
594
+ "epoch": 0.6413043478260869,
595
+ "grad_norm": 0.8378339409828186,
596
+ "learning_rate": 9.994625742221586e-05,
597
+ "loss": 1.2551,
598
+ "mean_token_accuracy": 0.5548771649599076,
599
+ "num_tokens": 761420.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.5428736090660096,
604
+ "epoch": 0.6521739130434783,
605
+ "grad_norm": 0.9249877333641052,
606
+ "learning_rate": 9.993760408736814e-05,
607
+ "loss": 1.282,
608
+ "mean_token_accuracy": 0.5393997848033905,
609
+ "num_tokens": 773676.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.5630847930908203,
614
+ "epoch": 0.6630434782608695,
615
+ "grad_norm": 0.8152625560760498,
616
+ "learning_rate": 9.992830553802696e-05,
617
+ "loss": 1.2763,
618
+ "mean_token_accuracy": 0.5402287989854813,
619
+ "num_tokens": 786757.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.532595145702362,
624
+ "epoch": 0.6739130434782609,
625
+ "grad_norm": 0.7313966751098633,
626
+ "learning_rate": 9.991836189433342e-05,
627
+ "loss": 1.2323,
628
+ "mean_token_accuracy": 0.5645015567541123,
629
+ "num_tokens": 799851.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.5160420179367065,
634
+ "epoch": 0.6847826086956522,
635
+ "grad_norm": 0.7158486843109131,
636
+ "learning_rate": 9.990777328476348e-05,
637
+ "loss": 1.2021,
638
+ "mean_token_accuracy": 0.555733984708786,
639
+ "num_tokens": 812648.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.5084859609603882,
644
+ "epoch": 0.6956521739130435,
645
+ "grad_norm": 0.7056333422660828,
646
+ "learning_rate": 9.98965398461264e-05,
647
+ "loss": 1.176,
648
+ "mean_token_accuracy": 0.5772889316082,
649
+ "num_tokens": 825054.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.5172175765037537,
654
+ "epoch": 0.7065217391304348,
655
+ "grad_norm": 0.8173061013221741,
656
+ "learning_rate": 9.988466172356282e-05,
657
+ "loss": 1.1893,
658
+ "mean_token_accuracy": 0.5774871349334717,
659
+ "num_tokens": 838148.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.5155822157859802,
664
+ "epoch": 0.717391304347826,
665
+ "grad_norm": 0.7483378648757935,
666
+ "learning_rate": 9.9872139070543e-05,
667
+ "loss": 1.2377,
668
+ "mean_token_accuracy": 0.5529649972915649,
669
+ "num_tokens": 851079.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.5392202258110046,
674
+ "epoch": 0.7282608695652174,
675
+ "grad_norm": 0.8020080924034119,
676
+ "learning_rate": 9.985897204886481e-05,
677
+ "loss": 1.2471,
678
+ "mean_token_accuracy": 0.5591055184602738,
679
+ "num_tokens": 863673.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.520468044281006,
684
+ "epoch": 0.7391304347826086,
685
+ "grad_norm": 0.7957432866096497,
686
+ "learning_rate": 9.984516082865159e-05,
687
+ "loss": 1.2582,
688
+ "mean_token_accuracy": 0.5404952645301819,
689
+ "num_tokens": 876764.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.5024335741996766,
694
+ "epoch": 0.75,
695
+ "grad_norm": 0.8745436072349548,
696
+ "learning_rate": 9.983070558835002e-05,
697
+ "loss": 1.2029,
698
+ "mean_token_accuracy": 0.5673643052577972,
699
+ "num_tokens": 889851.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.5227225780487061,
704
+ "epoch": 0.7608695652173914,
705
+ "grad_norm": 0.7866286039352417,
706
+ "learning_rate": 9.981560651472781e-05,
707
+ "loss": 1.2597,
708
+ "mean_token_accuracy": 0.5447615504264831,
709
+ "num_tokens": 903182.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.5330517888069153,
714
+ "epoch": 0.7717391304347826,
715
+ "grad_norm": 0.696030855178833,
716
+ "learning_rate": 9.97998638028712e-05,
717
+ "loss": 1.2417,
718
+ "mean_token_accuracy": 0.5569504171609878,
719
+ "num_tokens": 916564.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.5023605585098267,
724
+ "epoch": 0.782608695652174,
725
+ "grad_norm": 0.7980480194091797,
726
+ "learning_rate": 9.978347765618257e-05,
727
+ "loss": 1.2073,
728
+ "mean_token_accuracy": 0.562690931558609,
729
+ "num_tokens": 929820.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.5466702461242676,
734
+ "epoch": 0.7934782608695652,
735
+ "grad_norm": 0.8441147804260254,
736
+ "learning_rate": 9.976644828637767e-05,
737
+ "loss": 1.2859,
738
+ "mean_token_accuracy": 0.5330282121896743,
739
+ "num_tokens": 942449.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.515641415119171,
744
+ "epoch": 0.8043478260869565,
745
+ "grad_norm": 0.8833957314491272,
746
+ "learning_rate": 9.974877591348304e-05,
747
+ "loss": 1.2627,
748
+ "mean_token_accuracy": 0.5418030679225921,
749
+ "num_tokens": 955620.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 1.5292868852615356,
754
+ "epoch": 0.8152173913043478,
755
+ "grad_norm": 0.8666150569915771,
756
+ "learning_rate": 9.973046076583301e-05,
757
+ "loss": 1.2364,
758
+ "mean_token_accuracy": 0.5494832009077072,
759
+ "num_tokens": 968954.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.5135694026947022,
764
+ "epoch": 0.8260869565217391,
765
+ "grad_norm": 0.9172241687774658,
766
+ "learning_rate": 9.97115030800669e-05,
767
+ "loss": 1.2053,
768
+ "mean_token_accuracy": 0.5607668071985245,
769
+ "num_tokens": 981323.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.5214222908020019,
774
+ "epoch": 0.8369565217391305,
775
+ "grad_norm": 0.9353718161582947,
776
+ "learning_rate": 9.969190310112579e-05,
777
+ "loss": 1.225,
778
+ "mean_token_accuracy": 0.5599299073219299,
779
+ "num_tokens": 994834.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.5419185280799865,
784
+ "epoch": 0.8478260869565217,
785
+ "grad_norm": 0.717232882976532,
786
+ "learning_rate": 9.967166108224957e-05,
787
+ "loss": 1.2848,
788
+ "mean_token_accuracy": 0.5360999226570129,
789
+ "num_tokens": 1007806.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.545407807826996,
794
+ "epoch": 0.8586956521739131,
795
+ "grad_norm": 0.745928943157196,
796
+ "learning_rate": 9.965077728497348e-05,
797
+ "loss": 1.2683,
798
+ "mean_token_accuracy": 0.5427737534046173,
799
+ "num_tokens": 1021093.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.5416621446609498,
804
+ "epoch": 0.8695652173913043,
805
+ "grad_norm": 0.8545331954956055,
806
+ "learning_rate": 9.96292519791248e-05,
807
+ "loss": 1.3036,
808
+ "mean_token_accuracy": 0.5352708637714386,
809
+ "num_tokens": 1034317.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.516196882724762,
814
+ "epoch": 0.8804347826086957,
815
+ "grad_norm": 0.8239868879318237,
816
+ "learning_rate": 9.96070854428194e-05,
817
+ "loss": 1.1943,
818
+ "mean_token_accuracy": 0.568702632188797,
819
+ "num_tokens": 1047679.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.5478002548217773,
824
+ "epoch": 0.8913043478260869,
825
+ "grad_norm": 0.9187906980514526,
826
+ "learning_rate": 9.958427796245808e-05,
827
+ "loss": 1.2707,
828
+ "mean_token_accuracy": 0.5460701882839203,
829
+ "num_tokens": 1060840.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.540980589389801,
834
+ "epoch": 0.9021739130434783,
835
+ "grad_norm": 0.774869978427887,
836
+ "learning_rate": 9.956082983272293e-05,
837
+ "loss": 1.2397,
838
+ "mean_token_accuracy": 0.5464379012584686,
839
+ "num_tokens": 1073529.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.5131322622299195,
844
+ "epoch": 0.9130434782608695,
845
+ "grad_norm": 1.029721975326538,
846
+ "learning_rate": 9.953674135657345e-05,
847
+ "loss": 1.2198,
848
+ "mean_token_accuracy": 0.5641603857278824,
849
+ "num_tokens": 1086600.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.5259755611419679,
854
+ "epoch": 0.9239130434782609,
855
+ "grad_norm": 0.8057295083999634,
856
+ "learning_rate": 9.951201284524275e-05,
857
+ "loss": 1.2492,
858
+ "mean_token_accuracy": 0.5562368750572204,
859
+ "num_tokens": 1099737.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.5159748077392579,
864
+ "epoch": 0.9347826086956522,
865
+ "grad_norm": 0.6001420617103577,
866
+ "learning_rate": 9.94866446182334e-05,
867
+ "loss": 1.2524,
868
+ "mean_token_accuracy": 0.5458084315061569,
869
+ "num_tokens": 1112239.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 1.524741494655609,
874
+ "epoch": 0.9456521739130435,
875
+ "grad_norm": 0.847523033618927,
876
+ "learning_rate": 9.94606370033134e-05,
877
+ "loss": 1.2245,
878
+ "mean_token_accuracy": 0.5601188719272614,
879
+ "num_tokens": 1125191.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.5371912598609925,
884
+ "epoch": 0.9565217391304348,
885
+ "grad_norm": 0.767745852470398,
886
+ "learning_rate": 9.943399033651189e-05,
887
+ "loss": 1.2319,
888
+ "mean_token_accuracy": 0.5546965420246124,
889
+ "num_tokens": 1138077.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.5223184943199157,
894
+ "epoch": 0.967391304347826,
895
+ "grad_norm": 0.9313151836395264,
896
+ "learning_rate": 9.94067049621148e-05,
897
+ "loss": 1.2237,
898
+ "mean_token_accuracy": 0.5578917026519775,
899
+ "num_tokens": 1151364.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.530699372291565,
904
+ "epoch": 0.9782608695652174,
905
+ "grad_norm": 0.7053420543670654,
906
+ "learning_rate": 9.937878123266044e-05,
907
+ "loss": 1.2269,
908
+ "mean_token_accuracy": 0.5488695651292801,
909
+ "num_tokens": 1164326.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.5197353124618531,
914
+ "epoch": 0.9891304347826086,
915
+ "grad_norm": 0.9986150860786438,
916
+ "learning_rate": 9.9350219508935e-05,
917
+ "loss": 1.2106,
918
+ "mean_token_accuracy": 0.5582584798336029,
919
+ "num_tokens": 1176914.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.5376808762550354,
924
+ "epoch": 1.0,
925
+ "grad_norm": 0.7129160165786743,
926
+ "learning_rate": 9.93210201599677e-05,
927
+ "loss": 1.2377,
928
+ "mean_token_accuracy": 0.557282817363739,
929
+ "num_tokens": 1189994.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.5522505402565003,
934
+ "epoch": 1.0108695652173914,
935
+ "grad_norm": 0.9139987230300903,
936
+ "learning_rate": 9.929118356302621e-05,
937
+ "loss": 1.2492,
938
+ "mean_token_accuracy": 0.5444983661174774,
939
+ "num_tokens": 1202961.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.5519829273223877,
944
+ "epoch": 1.0217391304347827,
945
+ "grad_norm": 1.0422664880752563,
946
+ "learning_rate": 9.926071010361173e-05,
947
+ "loss": 1.1957,
948
+ "mean_token_accuracy": 0.5779279708862305,
949
+ "num_tokens": 1215901.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.5434082865715026,
954
+ "epoch": 1.0326086956521738,
955
+ "grad_norm": 1.0472567081451416,
956
+ "learning_rate": 9.922960017545395e-05,
957
+ "loss": 1.2263,
958
+ "mean_token_accuracy": 0.5640866041183472,
959
+ "num_tokens": 1228567.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 1.5352994203567505,
964
+ "epoch": 1.0434782608695652,
965
+ "grad_norm": 1.0810585021972656,
966
+ "learning_rate": 9.919785418050598e-05,
967
+ "loss": 1.1876,
968
+ "mean_token_accuracy": 0.5709751307964325,
969
+ "num_tokens": 1241529.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 1.4996863842010497,
974
+ "epoch": 1.0543478260869565,
975
+ "grad_norm": 1.1204661130905151,
976
+ "learning_rate": 9.916547252893923e-05,
977
+ "loss": 1.1354,
978
+ "mean_token_accuracy": 0.5961336076259613,
979
+ "num_tokens": 1254137.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 1.5315279841423035,
984
+ "epoch": 1.065217391304348,
985
+ "grad_norm": 1.0741767883300781,
986
+ "learning_rate": 9.9132455639138e-05,
987
+ "loss": 1.1422,
988
+ "mean_token_accuracy": 0.5875493228435517,
989
+ "num_tokens": 1266871.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 1.516059410572052,
994
+ "epoch": 1.0760869565217392,
995
+ "grad_norm": 1.1965429782867432,
996
+ "learning_rate": 9.90988039376942e-05,
997
+ "loss": 1.1438,
998
+ "mean_token_accuracy": 0.5906685352325439,
999
+ "num_tokens": 1279655.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 1.5148675918579102,
1004
+ "epoch": 1.0869565217391304,
1005
+ "grad_norm": 1.1992353200912476,
1006
+ "learning_rate": 9.906451785940167e-05,
1007
+ "loss": 1.1636,
1008
+ "mean_token_accuracy": 0.5710582077503205,
1009
+ "num_tokens": 1292202.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 1.5187682271003724,
1014
+ "epoch": 1.0978260869565217,
1015
+ "grad_norm": 1.0606764554977417,
1016
+ "learning_rate": 9.902959784725077e-05,
1017
+ "loss": 1.1763,
1018
+ "mean_token_accuracy": 0.5760969400405884,
1019
+ "num_tokens": 1305284.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 1.5265469312667848,
1024
+ "epoch": 1.108695652173913,
1025
+ "grad_norm": 1.02944815158844,
1026
+ "learning_rate": 9.899404435242246e-05,
1027
+ "loss": 1.2096,
1028
+ "mean_token_accuracy": 0.5624277234077454,
1029
+ "num_tokens": 1318408.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 1.5455774545669556,
1034
+ "epoch": 1.1195652173913044,
1035
+ "grad_norm": 1.1493759155273438,
1036
+ "learning_rate": 9.895785783428262e-05,
1037
+ "loss": 1.1652,
1038
+ "mean_token_accuracy": 0.5867336988449097,
1039
+ "num_tokens": 1331156.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 1.5371973156929015,
1044
+ "epoch": 1.1304347826086956,
1045
+ "grad_norm": 0.9468239545822144,
1046
+ "learning_rate": 9.8921038760376e-05,
1047
+ "loss": 1.2371,
1048
+ "mean_token_accuracy": 0.5544474184513092,
1049
+ "num_tokens": 1343904.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 1.5403631925582886,
1054
+ "epoch": 1.141304347826087,
1055
+ "grad_norm": 1.1717609167099,
1056
+ "learning_rate": 9.888358760642029e-05,
1057
+ "loss": 1.1394,
1058
+ "mean_token_accuracy": 0.5933512449264526,
1059
+ "num_tokens": 1356797.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 1.5518387794494628,
1064
+ "epoch": 1.1521739130434783,
1065
+ "grad_norm": 1.2024801969528198,
1066
+ "learning_rate": 9.884550485629987e-05,
1067
+ "loss": 1.2065,
1068
+ "mean_token_accuracy": 0.5667118012905121,
1069
+ "num_tokens": 1369690.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 1.5736596703529357,
1074
+ "epoch": 1.1630434782608696,
1075
+ "grad_norm": 1.0323596000671387,
1076
+ "learning_rate": 9.88067910020596e-05,
1077
+ "loss": 1.2124,
1078
+ "mean_token_accuracy": 0.5691272497177124,
1079
+ "num_tokens": 1382561.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 1.57814359664917,
1084
+ "epoch": 1.1739130434782608,
1085
+ "grad_norm": 1.1128944158554077,
1086
+ "learning_rate": 9.876744654389854e-05,
1087
+ "loss": 1.2319,
1088
+ "mean_token_accuracy": 0.554848113656044,
1089
+ "num_tokens": 1395409.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 1.5651036262512208,
1094
+ "epoch": 1.184782608695652,
1095
+ "grad_norm": 1.1131497621536255,
1096
+ "learning_rate": 9.872747199016328e-05,
1097
+ "loss": 1.1995,
1098
+ "mean_token_accuracy": 0.5680587291717529,
1099
+ "num_tokens": 1408511.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 1.519801914691925,
1104
+ "epoch": 1.1956521739130435,
1105
+ "grad_norm": 0.8381641507148743,
1106
+ "learning_rate": 9.868686785734165e-05,
1107
+ "loss": 1.1729,
1108
+ "mean_token_accuracy": 0.5780038118362427,
1109
+ "num_tokens": 1421328.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 1.5411308765411378,
1114
+ "epoch": 1.2065217391304348,
1115
+ "grad_norm": 1.1784008741378784,
1116
+ "learning_rate": 9.86456346700558e-05,
1117
+ "loss": 1.2026,
1118
+ "mean_token_accuracy": 0.5581619143486023,
1119
+ "num_tokens": 1434644.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 1.524932038784027,
1124
+ "epoch": 1.2173913043478262,
1125
+ "grad_norm": 0.9289618730545044,
1126
+ "learning_rate": 9.860377296105556e-05,
1127
+ "loss": 1.219,
1128
+ "mean_token_accuracy": 0.557993471622467,
1129
+ "num_tokens": 1447469.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 1.5029574513435364,
1134
+ "epoch": 1.2282608695652173,
1135
+ "grad_norm": 1.0168135166168213,
1136
+ "learning_rate": 9.856128327121155e-05,
1137
+ "loss": 1.1589,
1138
+ "mean_token_accuracy": 0.578672569990158,
1139
+ "num_tokens": 1460202.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 1.5095925211906434,
1144
+ "epoch": 1.2391304347826086,
1145
+ "grad_norm": 1.052454948425293,
1146
+ "learning_rate": 9.85181661495081e-05,
1147
+ "loss": 1.2232,
1148
+ "mean_token_accuracy": 0.5522898703813552,
1149
+ "num_tokens": 1473114.0,
1150
+ "step": 1140
1151
+ },
1152
+ {
1153
+ "entropy": 1.5059074401855468,
1154
+ "epoch": 1.25,
1155
+ "grad_norm": 1.20883309841156,
1156
+ "learning_rate": 9.847442215303626e-05,
1157
+ "loss": 1.2172,
1158
+ "mean_token_accuracy": 0.5659465253353119,
1159
+ "num_tokens": 1485990.0,
1160
+ "step": 1150
1161
+ },
1162
+ {
1163
+ "entropy": 1.494919514656067,
1164
+ "epoch": 1.2608695652173914,
1165
+ "grad_norm": 1.1653634309768677,
1166
+ "learning_rate": 9.843005184698655e-05,
1167
+ "loss": 1.1817,
1168
+ "mean_token_accuracy": 0.5764101088047028,
1169
+ "num_tokens": 1498939.0,
1170
+ "step": 1160
1171
+ },
1172
+ {
1173
+ "entropy": 1.5184181690216065,
1174
+ "epoch": 1.2717391304347827,
1175
+ "grad_norm": 1.1174242496490479,
1176
+ "learning_rate": 9.838505580464168e-05,
1177
+ "loss": 1.1976,
1178
+ "mean_token_accuracy": 0.5707351744174958,
1179
+ "num_tokens": 1511943.0,
1180
+ "step": 1170
1181
+ },
1182
+ {
1183
+ "entropy": 1.5217233657836915,
1184
+ "epoch": 1.2826086956521738,
1185
+ "grad_norm": 1.0029795169830322,
1186
+ "learning_rate": 9.833943460736912e-05,
1187
+ "loss": 1.2296,
1188
+ "mean_token_accuracy": 0.5572409898042678,
1189
+ "num_tokens": 1525135.0,
1190
+ "step": 1180
1191
+ },
1192
+ {
1193
+ "entropy": 1.514461922645569,
1194
+ "epoch": 1.2934782608695652,
1195
+ "grad_norm": 1.2473056316375732,
1196
+ "learning_rate": 9.829318884461359e-05,
1197
+ "loss": 1.221,
1198
+ "mean_token_accuracy": 0.5566778779029846,
1199
+ "num_tokens": 1537699.0,
1200
+ "step": 1190
1201
+ },
1202
+ {
1203
+ "entropy": 1.5298507332801818,
1204
+ "epoch": 1.3043478260869565,
1205
+ "grad_norm": 1.068049430847168,
1206
+ "learning_rate": 9.824631911388948e-05,
1207
+ "loss": 1.248,
1208
+ "mean_token_accuracy": 0.5430671572685242,
1209
+ "num_tokens": 1550938.0,
1210
+ "step": 1200
1211
+ },
1212
+ {
1213
+ "entropy": 1.5463980197906495,
1214
+ "epoch": 1.315217391304348,
1215
+ "grad_norm": 1.0760388374328613,
1216
+ "learning_rate": 9.819882602077309e-05,
1217
+ "loss": 1.2825,
1218
+ "mean_token_accuracy": 0.5330462247133255,
1219
+ "num_tokens": 1563597.0,
1220
+ "step": 1210
1221
+ },
1222
+ {
1223
+ "entropy": 1.5457575082778932,
1224
+ "epoch": 1.3260869565217392,
1225
+ "grad_norm": 1.1161272525787354,
1226
+ "learning_rate": 9.815071017889482e-05,
1227
+ "loss": 1.2598,
1228
+ "mean_token_accuracy": 0.543943053483963,
1229
+ "num_tokens": 1576301.0,
1230
+ "step": 1220
1231
+ },
1232
+ {
1233
+ "entropy": 1.5349620819091796,
1234
+ "epoch": 1.3369565217391304,
1235
+ "grad_norm": 1.1779778003692627,
1236
+ "learning_rate": 9.810197220993123e-05,
1237
+ "loss": 1.2551,
1238
+ "mean_token_accuracy": 0.5386941403150558,
1239
+ "num_tokens": 1589776.0,
1240
+ "step": 1230
1241
+ },
1242
+ {
1243
+ "entropy": 1.5158817052841187,
1244
+ "epoch": 1.3478260869565217,
1245
+ "grad_norm": 1.1150175333023071,
1246
+ "learning_rate": 9.805261274359705e-05,
1247
+ "loss": 1.193,
1248
+ "mean_token_accuracy": 0.5642519950866699,
1249
+ "num_tokens": 1602239.0,
1250
+ "step": 1240
1251
+ },
1252
+ {
1253
+ "entropy": 1.512274718284607,
1254
+ "epoch": 1.358695652173913,
1255
+ "grad_norm": 0.9392043948173523,
1256
+ "learning_rate": 9.800263241763698e-05,
1257
+ "loss": 1.2334,
1258
+ "mean_token_accuracy": 0.5577278465032578,
1259
+ "num_tokens": 1615621.0,
1260
+ "step": 1250
1261
+ },
1262
+ {
1263
+ "entropy": 1.5087523460388184,
1264
+ "epoch": 1.3695652173913042,
1265
+ "grad_norm": 0.9521236419677734,
1266
+ "learning_rate": 9.795203187781751e-05,
1267
+ "loss": 1.1651,
1268
+ "mean_token_accuracy": 0.5836262464523315,
1269
+ "num_tokens": 1628741.0,
1270
+ "step": 1260
1271
+ },
1272
+ {
1273
+ "entropy": 1.5212602257728576,
1274
+ "epoch": 1.3804347826086958,
1275
+ "grad_norm": 0.9689566493034363,
1276
+ "learning_rate": 9.790081177791852e-05,
1277
+ "loss": 1.1944,
1278
+ "mean_token_accuracy": 0.572248637676239,
1279
+ "num_tokens": 1641646.0,
1280
+ "step": 1270
1281
+ },
1282
+ {
1283
+ "entropy": 1.521955931186676,
1284
+ "epoch": 1.391304347826087,
1285
+ "grad_norm": 1.016711711883545,
1286
+ "learning_rate": 9.784897277972491e-05,
1287
+ "loss": 1.2105,
1288
+ "mean_token_accuracy": 0.5605559885501862,
1289
+ "num_tokens": 1654499.0,
1290
+ "step": 1280
1291
+ },
1292
+ {
1293
+ "entropy": 1.5126453638076782,
1294
+ "epoch": 1.4021739130434783,
1295
+ "grad_norm": 1.1951313018798828,
1296
+ "learning_rate": 9.779651555301794e-05,
1297
+ "loss": 1.2305,
1298
+ "mean_token_accuracy": 0.5537042915821075,
1299
+ "num_tokens": 1667748.0,
1300
+ "step": 1290
1301
+ },
1302
+ {
1303
+ "entropy": 1.528828752040863,
1304
+ "epoch": 1.4130434782608696,
1305
+ "grad_norm": 1.1385231018066406,
1306
+ "learning_rate": 9.77434407755667e-05,
1307
+ "loss": 1.2294,
1308
+ "mean_token_accuracy": 0.554050150513649,
1309
+ "num_tokens": 1681184.0,
1310
+ "step": 1300
1311
+ },
1312
+ {
1313
+ "entropy": 1.510583758354187,
1314
+ "epoch": 1.4239130434782608,
1315
+ "grad_norm": 1.0576328039169312,
1316
+ "learning_rate": 9.768974913311922e-05,
1317
+ "loss": 1.2674,
1318
+ "mean_token_accuracy": 0.5414516568183899,
1319
+ "num_tokens": 1693818.0,
1320
+ "step": 1310
1321
+ },
1322
+ {
1323
+ "entropy": 1.5150775551795959,
1324
+ "epoch": 1.434782608695652,
1325
+ "grad_norm": 1.3364728689193726,
1326
+ "learning_rate": 9.763544131939374e-05,
1327
+ "loss": 1.2075,
1328
+ "mean_token_accuracy": 0.559964632987976,
1329
+ "num_tokens": 1706939.0,
1330
+ "step": 1320
1331
+ },
1332
+ {
1333
+ "entropy": 1.5151704668998718,
1334
+ "epoch": 1.4456521739130435,
1335
+ "grad_norm": 1.02871835231781,
1336
+ "learning_rate": 9.758051803606971e-05,
1337
+ "loss": 1.2487,
1338
+ "mean_token_accuracy": 0.552227908372879,
1339
+ "num_tokens": 1719315.0,
1340
+ "step": 1330
1341
+ },
1342
+ {
1343
+ "entropy": 1.5152636528015138,
1344
+ "epoch": 1.4565217391304348,
1345
+ "grad_norm": 1.0097824335098267,
1346
+ "learning_rate": 9.75249799927786e-05,
1347
+ "loss": 1.2263,
1348
+ "mean_token_accuracy": 0.5533849179744721,
1349
+ "num_tokens": 1731891.0,
1350
+ "step": 1340
1351
+ },
1352
+ {
1353
+ "entropy": 1.512537384033203,
1354
+ "epoch": 1.4673913043478262,
1355
+ "grad_norm": 1.2632033824920654,
1356
+ "learning_rate": 9.746882790709491e-05,
1357
+ "loss": 1.222,
1358
+ "mean_token_accuracy": 0.5614925265312195,
1359
+ "num_tokens": 1744427.0,
1360
+ "step": 1350
1361
+ },
1362
+ {
1363
+ "entropy": 1.5295302748680115,
1364
+ "epoch": 1.4782608695652173,
1365
+ "grad_norm": 1.113368034362793,
1366
+ "learning_rate": 9.741206250452683e-05,
1367
+ "loss": 1.2735,
1368
+ "mean_token_accuracy": 0.539223712682724,
1369
+ "num_tokens": 1757083.0,
1370
+ "step": 1360
1371
+ },
1372
+ {
1373
+ "entropy": 1.536200964450836,
1374
+ "epoch": 1.4891304347826086,
1375
+ "grad_norm": 1.1522810459136963,
1376
+ "learning_rate": 9.735468451850681e-05,
1377
+ "loss": 1.2152,
1378
+ "mean_token_accuracy": 0.565186282992363,
1379
+ "num_tokens": 1769982.0,
1380
+ "step": 1370
1381
+ },
1382
+ {
1383
+ "entropy": 1.495800745487213,
1384
+ "epoch": 1.5,
1385
+ "grad_norm": 1.2632598876953125,
1386
+ "learning_rate": 9.729669469038216e-05,
1387
+ "loss": 1.1635,
1388
+ "mean_token_accuracy": 0.5871178984642029,
1389
+ "num_tokens": 1783102.0,
1390
+ "step": 1380
1391
+ },
1392
+ {
1393
+ "entropy": 1.535517191886902,
1394
+ "epoch": 1.5108695652173914,
1395
+ "grad_norm": 0.9593290090560913,
1396
+ "learning_rate": 9.723809376940544e-05,
1397
+ "loss": 1.2108,
1398
+ "mean_token_accuracy": 0.5709479689598084,
1399
+ "num_tokens": 1796398.0,
1400
+ "step": 1390
1401
+ },
1402
+ {
1403
+ "entropy": 1.529611337184906,
1404
+ "epoch": 1.5217391304347827,
1405
+ "grad_norm": 1.0819748640060425,
1406
+ "learning_rate": 9.717888251272477e-05,
1407
+ "loss": 1.1972,
1408
+ "mean_token_accuracy": 0.5633429378271103,
1409
+ "num_tokens": 1809379.0,
1410
+ "step": 1400
1411
+ },
1412
+ {
1413
+ "entropy": 1.5493282318115233,
1414
+ "epoch": 1.5326086956521738,
1415
+ "grad_norm": 0.9472999572753906,
1416
+ "learning_rate": 9.71190616853741e-05,
1417
+ "loss": 1.2616,
1418
+ "mean_token_accuracy": 0.5486618399620056,
1419
+ "num_tokens": 1822664.0,
1420
+ "step": 1410
1421
+ },
1422
+ {
1423
+ "entropy": 1.4989375710487365,
1424
+ "epoch": 1.5434782608695652,
1425
+ "grad_norm": 1.2883214950561523,
1426
+ "learning_rate": 9.705863206026321e-05,
1427
+ "loss": 1.2137,
1428
+ "mean_token_accuracy": 0.558601850271225,
1429
+ "num_tokens": 1835336.0,
1430
+ "step": 1420
1431
+ },
1432
+ {
1433
+ "entropy": 1.5061516761779785,
1434
+ "epoch": 1.5543478260869565,
1435
+ "grad_norm": 0.9577755928039551,
1436
+ "learning_rate": 9.699759441816787e-05,
1437
+ "loss": 1.1739,
1438
+ "mean_token_accuracy": 0.577557110786438,
1439
+ "num_tokens": 1847755.0,
1440
+ "step": 1430
1441
+ },
1442
+ {
1443
+ "entropy": 1.5141437649726868,
1444
+ "epoch": 1.5652173913043477,
1445
+ "grad_norm": 1.0751005411148071,
1446
+ "learning_rate": 9.693594954771965e-05,
1447
+ "loss": 1.231,
1448
+ "mean_token_accuracy": 0.5506497710943222,
1449
+ "num_tokens": 1860302.0,
1450
+ "step": 1440
1451
+ },
1452
+ {
1453
+ "entropy": 1.5419356107711792,
1454
+ "epoch": 1.5760869565217392,
1455
+ "grad_norm": 1.0141667127609253,
1456
+ "learning_rate": 9.687369824539577e-05,
1457
+ "loss": 1.2788,
1458
+ "mean_token_accuracy": 0.5303231775760651,
1459
+ "num_tokens": 1873093.0,
1460
+ "step": 1450
1461
+ },
1462
+ {
1463
+ "entropy": 1.520876133441925,
1464
+ "epoch": 1.5869565217391304,
1465
+ "grad_norm": 1.109215259552002,
1466
+ "learning_rate": 9.68108413155088e-05,
1467
+ "loss": 1.2333,
1468
+ "mean_token_accuracy": 0.5601014912128448,
1469
+ "num_tokens": 1886177.0,
1470
+ "step": 1460
1471
+ },
1472
+ {
1473
+ "entropy": 1.4981224894523621,
1474
+ "epoch": 1.5978260869565217,
1475
+ "grad_norm": 0.9200493097305298,
1476
+ "learning_rate": 9.674737957019624e-05,
1477
+ "loss": 1.1852,
1478
+ "mean_token_accuracy": 0.5700576066970825,
1479
+ "num_tokens": 1899113.0,
1480
+ "step": 1470
1481
+ },
1482
+ {
1483
+ "entropy": 1.5140800833702088,
1484
+ "epoch": 1.608695652173913,
1485
+ "grad_norm": 1.190007209777832,
1486
+ "learning_rate": 9.66833138294101e-05,
1487
+ "loss": 1.1929,
1488
+ "mean_token_accuracy": 0.5691904962062836,
1489
+ "num_tokens": 1912474.0,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "entropy": 1.5299779295921325,
1494
+ "epoch": 1.6195652173913042,
1495
+ "grad_norm": 0.9787003397941589,
1496
+ "learning_rate": 9.661864492090625e-05,
1497
+ "loss": 1.2179,
1498
+ "mean_token_accuracy": 0.553766930103302,
1499
+ "num_tokens": 1925685.0,
1500
+ "step": 1490
1501
+ },
1502
+ {
1503
+ "entropy": 1.5431510925292968,
1504
+ "epoch": 1.6304347826086958,
1505
+ "grad_norm": 1.1734333038330078,
1506
+ "learning_rate": 9.655337368023371e-05,
1507
+ "loss": 1.2108,
1508
+ "mean_token_accuracy": 0.5539384454488754,
1509
+ "num_tokens": 1938610.0,
1510
+ "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 1.5246233105659486,
1514
+ "epoch": 1.641304347826087,
1515
+ "grad_norm": 1.072691559791565,
1516
+ "learning_rate": 9.64875009507239e-05,
1517
+ "loss": 1.1999,
1518
+ "mean_token_accuracy": 0.5761029601097107,
1519
+ "num_tokens": 1951241.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 1.538881742954254,
1524
+ "epoch": 1.6521739130434783,
1525
+ "grad_norm": 1.0783456563949585,
1526
+ "learning_rate": 9.642102758347973e-05,
1527
+ "loss": 1.2443,
1528
+ "mean_token_accuracy": 0.5502734839916229,
1529
+ "num_tokens": 1964816.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 1.550068199634552,
1534
+ "epoch": 1.6630434782608696,
1535
+ "grad_norm": 1.0582056045532227,
1536
+ "learning_rate": 9.63539544373646e-05,
1537
+ "loss": 1.2182,
1538
+ "mean_token_accuracy": 0.5598388969898224,
1539
+ "num_tokens": 1977930.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 1.5344447016716003,
1544
+ "epoch": 1.6739130434782608,
1545
+ "grad_norm": 0.9788505434989929,
1546
+ "learning_rate": 9.628628237899126e-05,
1547
+ "loss": 1.1852,
1548
+ "mean_token_accuracy": 0.5595145970582962,
1549
+ "num_tokens": 1991032.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 1.5468374967575074,
1554
+ "epoch": 1.6847826086956523,
1555
+ "grad_norm": 1.0464048385620117,
1556
+ "learning_rate": 9.621801228271073e-05,
1557
+ "loss": 1.2175,
1558
+ "mean_token_accuracy": 0.5616866886615753,
1559
+ "num_tokens": 2004207.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 1.5422045588493347,
1564
+ "epoch": 1.6956521739130435,
1565
+ "grad_norm": 0.8307158946990967,
1566
+ "learning_rate": 9.614914503060083e-05,
1567
+ "loss": 1.2202,
1568
+ "mean_token_accuracy": 0.5515525698661804,
1569
+ "num_tokens": 2016969.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 1.5343055129051208,
1574
+ "epoch": 1.7065217391304348,
1575
+ "grad_norm": 1.198614239692688,
1576
+ "learning_rate": 9.607968151245498e-05,
1577
+ "loss": 1.1866,
1578
+ "mean_token_accuracy": 0.5771215856075287,
1579
+ "num_tokens": 2029750.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 1.5321205615997315,
1584
+ "epoch": 1.7173913043478262,
1585
+ "grad_norm": 0.9247676134109497,
1586
+ "learning_rate": 9.600962262577053e-05,
1587
+ "loss": 1.2205,
1588
+ "mean_token_accuracy": 0.5626431256532669,
1589
+ "num_tokens": 2043181.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 1.541359269618988,
1594
+ "epoch": 1.7282608695652173,
1595
+ "grad_norm": 1.0934436321258545,
1596
+ "learning_rate": 9.593896927573728e-05,
1597
+ "loss": 1.2397,
1598
+ "mean_token_accuracy": 0.5406488478183746,
1599
+ "num_tokens": 2056541.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 1.5094799280166626,
1604
+ "epoch": 1.7391304347826086,
1605
+ "grad_norm": 0.8803229928016663,
1606
+ "learning_rate": 9.586772237522573e-05,
1607
+ "loss": 1.2047,
1608
+ "mean_token_accuracy": 0.5659328937530518,
1609
+ "num_tokens": 2069752.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 1.4946965217590331,
1614
+ "epoch": 1.75,
1615
+ "grad_norm": 1.0182883739471436,
1616
+ "learning_rate": 9.579588284477526e-05,
1617
+ "loss": 1.1492,
1618
+ "mean_token_accuracy": 0.5829819083213806,
1619
+ "num_tokens": 2083119.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 1.483540380001068,
1624
+ "epoch": 1.7608695652173914,
1625
+ "grad_norm": 1.2263387441635132,
1626
+ "learning_rate": 9.572345161258235e-05,
1627
+ "loss": 1.1474,
1628
+ "mean_token_accuracy": 0.5895151972770691,
1629
+ "num_tokens": 2095862.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 1.5072382926940917,
1634
+ "epoch": 1.7717391304347827,
1635
+ "grad_norm": 0.8639858365058899,
1636
+ "learning_rate": 9.565042961448844e-05,
1637
+ "loss": 1.1997,
1638
+ "mean_token_accuracy": 0.5625985980033874,
1639
+ "num_tokens": 2108549.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 1.5286765098571777,
1644
+ "epoch": 1.7826086956521738,
1645
+ "grad_norm": 1.0652116537094116,
1646
+ "learning_rate": 9.557681779396797e-05,
1647
+ "loss": 1.2253,
1648
+ "mean_token_accuracy": 0.5569576025009155,
1649
+ "num_tokens": 2120871.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 1.5006824493408204,
1654
+ "epoch": 1.7934782608695652,
1655
+ "grad_norm": 0.949942946434021,
1656
+ "learning_rate": 9.550261710211608e-05,
1657
+ "loss": 1.1973,
1658
+ "mean_token_accuracy": 0.5634852379560471,
1659
+ "num_tokens": 2134097.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 1.5050195574760437,
1664
+ "epoch": 1.8043478260869565,
1665
+ "grad_norm": 1.016350507736206,
1666
+ "learning_rate": 9.542782849763637e-05,
1667
+ "loss": 1.1709,
1668
+ "mean_token_accuracy": 0.5780033886432647,
1669
+ "num_tokens": 2147811.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 1.5177413702011109,
1674
+ "epoch": 1.8152173913043477,
1675
+ "grad_norm": 1.264799952507019,
1676
+ "learning_rate": 9.535245294682857e-05,
1677
+ "loss": 1.2513,
1678
+ "mean_token_accuracy": 0.5521585702896118,
1679
+ "num_tokens": 2160506.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 1.5374733328819274,
1684
+ "epoch": 1.8260869565217392,
1685
+ "grad_norm": 1.1713751554489136,
1686
+ "learning_rate": 9.527649142357596e-05,
1687
+ "loss": 1.2708,
1688
+ "mean_token_accuracy": 0.5314113944768906,
1689
+ "num_tokens": 2173328.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 1.5083181142807007,
1694
+ "epoch": 1.8369565217391304,
1695
+ "grad_norm": 1.1553071737289429,
1696
+ "learning_rate": 9.519994490933279e-05,
1697
+ "loss": 1.206,
1698
+ "mean_token_accuracy": 0.5680734992027283,
1699
+ "num_tokens": 2186452.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 1.5291340470314025,
1704
+ "epoch": 1.8478260869565217,
1705
+ "grad_norm": 1.1443442106246948,
1706
+ "learning_rate": 9.51228143931117e-05,
1707
+ "loss": 1.2351,
1708
+ "mean_token_accuracy": 0.5539528131484985,
1709
+ "num_tokens": 2199594.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 1.5204999327659607,
1714
+ "epoch": 1.858695652173913,
1715
+ "grad_norm": 1.1584019660949707,
1716
+ "learning_rate": 9.504510087147088e-05,
1717
+ "loss": 1.2338,
1718
+ "mean_token_accuracy": 0.5519226849079132,
1719
+ "num_tokens": 2212135.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 1.5614403247833253,
1724
+ "epoch": 1.8695652173913042,
1725
+ "grad_norm": 1.0798224210739136,
1726
+ "learning_rate": 9.496680534850113e-05,
1727
+ "loss": 1.2534,
1728
+ "mean_token_accuracy": 0.5530328571796417,
1729
+ "num_tokens": 2225159.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 1.5276212096214294,
1734
+ "epoch": 1.8804347826086958,
1735
+ "grad_norm": 1.1296766996383667,
1736
+ "learning_rate": 9.488792883581299e-05,
1737
+ "loss": 1.1784,
1738
+ "mean_token_accuracy": 0.5774711936712265,
1739
+ "num_tokens": 2238139.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 1.544056522846222,
1744
+ "epoch": 1.891304347826087,
1745
+ "grad_norm": 1.1214172840118408,
1746
+ "learning_rate": 9.480847235252361e-05,
1747
+ "loss": 1.2268,
1748
+ "mean_token_accuracy": 0.5613886952400208,
1749
+ "num_tokens": 2250928.0,
1750
+ "step": 1740
1751
+ },
1752
+ {
1753
+ "entropy": 1.5295695900917052,
1754
+ "epoch": 1.9021739130434783,
1755
+ "grad_norm": 1.1650352478027344,
1756
+ "learning_rate": 9.472843692524363e-05,
1757
+ "loss": 1.1573,
1758
+ "mean_token_accuracy": 0.5787465155124665,
1759
+ "num_tokens": 2263338.0,
1760
+ "step": 1750
1761
+ },
1762
+ {
1763
+ "entropy": 1.5347764611244201,
1764
+ "epoch": 1.9130434782608696,
1765
+ "grad_norm": 1.0249896049499512,
1766
+ "learning_rate": 9.464782358806383e-05,
1767
+ "loss": 1.1731,
1768
+ "mean_token_accuracy": 0.5780636668205261,
1769
+ "num_tokens": 2276200.0,
1770
+ "step": 1760
1771
+ },
1772
+ {
1773
+ "entropy": 1.5715635061264037,
1774
+ "epoch": 1.9239130434782608,
1775
+ "grad_norm": 1.0768051147460938,
1776
+ "learning_rate": 9.45666333825419e-05,
1777
+ "loss": 1.2585,
1778
+ "mean_token_accuracy": 0.5452336609363556,
1779
+ "num_tokens": 2289088.0,
1780
+ "step": 1770
1781
+ },
1782
+ {
1783
+ "entropy": 1.5402274131774902,
1784
+ "epoch": 1.9347826086956523,
1785
+ "grad_norm": 1.0846654176712036,
1786
+ "learning_rate": 9.448486735768884e-05,
1787
+ "loss": 1.1918,
1788
+ "mean_token_accuracy": 0.5699589729309082,
1789
+ "num_tokens": 2302544.0,
1790
+ "step": 1780
1791
+ },
1792
+ {
1793
+ "entropy": 1.5048401594161986,
1794
+ "epoch": 1.9456521739130435,
1795
+ "grad_norm": 1.1533433198928833,
1796
+ "learning_rate": 9.440252656995551e-05,
1797
+ "loss": 1.1792,
1798
+ "mean_token_accuracy": 0.5685461640357972,
1799
+ "num_tokens": 2315473.0,
1800
+ "step": 1790
1801
+ },
1802
+ {
1803
+ "entropy": 1.5128441214561463,
1804
+ "epoch": 1.9565217391304348,
1805
+ "grad_norm": 1.2847894430160522,
1806
+ "learning_rate": 9.431961208321892e-05,
1807
+ "loss": 1.1566,
1808
+ "mean_token_accuracy": 0.5870453357696533,
1809
+ "num_tokens": 2329176.0,
1810
+ "step": 1800
1811
+ },
1812
+ {
1813
+ "entropy": 1.5362990856170655,
1814
+ "epoch": 1.9673913043478262,
1815
+ "grad_norm": 1.2497868537902832,
1816
+ "learning_rate": 9.423612496876855e-05,
1817
+ "loss": 1.1896,
1818
+ "mean_token_accuracy": 0.5719706892967225,
1819
+ "num_tokens": 2341591.0,
1820
+ "step": 1810
1821
+ },
1822
+ {
1823
+ "entropy": 1.5580734014511108,
1824
+ "epoch": 1.9782608695652173,
1825
+ "grad_norm": 1.1140056848526,
1826
+ "learning_rate": 9.415206630529241e-05,
1827
+ "loss": 1.2434,
1828
+ "mean_token_accuracy": 0.5461874425411224,
1829
+ "num_tokens": 2354577.0,
1830
+ "step": 1820
1831
+ },
1832
+ {
1833
+ "entropy": 1.5499179720878602,
1834
+ "epoch": 1.9891304347826086,
1835
+ "grad_norm": 1.0708650350570679,
1836
+ "learning_rate": 9.406743717886321e-05,
1837
+ "loss": 1.1635,
1838
+ "mean_token_accuracy": 0.5835445284843445,
1839
+ "num_tokens": 2366934.0,
1840
+ "step": 1830
1841
+ },
1842
+ {
1843
+ "entropy": 1.5282660722732544,
1844
+ "epoch": 2.0,
1845
+ "grad_norm": 0.9982873797416687,
1846
+ "learning_rate": 9.398223868292424e-05,
1847
+ "loss": 1.162,
1848
+ "mean_token_accuracy": 0.5795026063919068,
1849
+ "num_tokens": 2379988.0,
1850
+ "step": 1840
1851
+ }
1852
+ ],
1853
+ "logging_steps": 10,
1854
+ "max_steps": 9200,
1855
+ "num_input_tokens_seen": 0,
1856
+ "num_train_epochs": 10,
1857
+ "save_steps": 500,
1858
+ "stateful_callbacks": {
1859
+ "TrainerControl": {
1860
+ "args": {
1861
+ "should_epoch_stop": false,
1862
+ "should_evaluate": false,
1863
+ "should_log": false,
1864
+ "should_save": true,
1865
+ "should_training_stop": false
1866
+ },
1867
+ "attributes": {}
1868
+ }
1869
+ },
1870
+ "total_flos": 1.0125617378081587e+17,
1871
+ "train_batch_size": 8,
1872
+ "trial_name": null,
1873
+ "trial_params": null
1874
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d307793ac8defecd3c83909e3edd67ba0adff5dab9d19e8ababe22ba1e871ad
3
+ size 6481
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-1840/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "v_proj",
37
+ "q_proj",
38
+ "o_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe3124146d4d0372b460431bd6b6771f3c9e6e5f34e80127ceca6056b0fbd2b2
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/trainer_state.json ADDED
@@ -0,0 +1,2794 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2760,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2980000615119933,
14
+ "epoch": 0.010869565217391304,
15
+ "grad_norm": 7.514286994934082,
16
+ "learning_rate": 1.956521739130435e-06,
17
+ "loss": 1.8548,
18
+ "mean_token_accuracy": 0.5365569293498993,
19
+ "num_tokens": 13273.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.3184159398078918,
24
+ "epoch": 0.021739130434782608,
25
+ "grad_norm": 6.582128524780273,
26
+ "learning_rate": 4.130434782608695e-06,
27
+ "loss": 1.9416,
28
+ "mean_token_accuracy": 0.5010036021471024,
29
+ "num_tokens": 26299.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.302778995037079,
34
+ "epoch": 0.03260869565217391,
35
+ "grad_norm": 6.661994457244873,
36
+ "learning_rate": 6.304347826086957e-06,
37
+ "loss": 1.7644,
38
+ "mean_token_accuracy": 0.5327741354703903,
39
+ "num_tokens": 39608.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.355496096611023,
44
+ "epoch": 0.043478260869565216,
45
+ "grad_norm": 2.829239845275879,
46
+ "learning_rate": 8.478260869565217e-06,
47
+ "loss": 1.5473,
48
+ "mean_token_accuracy": 0.5216561764478683,
49
+ "num_tokens": 52279.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.4233315467834473,
54
+ "epoch": 0.05434782608695652,
55
+ "grad_norm": 1.384964108467102,
56
+ "learning_rate": 1.0652173913043479e-05,
57
+ "loss": 1.3512,
58
+ "mean_token_accuracy": 0.5346131652593613,
59
+ "num_tokens": 65371.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.4783715605735779,
64
+ "epoch": 0.06521739130434782,
65
+ "grad_norm": 1.2184863090515137,
66
+ "learning_rate": 1.2826086956521741e-05,
67
+ "loss": 1.3353,
68
+ "mean_token_accuracy": 0.5265826016664505,
69
+ "num_tokens": 78549.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.4811665654182433,
74
+ "epoch": 0.07608695652173914,
75
+ "grad_norm": 0.8817082047462463,
76
+ "learning_rate": 1.5e-05,
77
+ "loss": 1.2885,
78
+ "mean_token_accuracy": 0.5369167566299439,
79
+ "num_tokens": 91168.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.485330879688263,
84
+ "epoch": 0.08695652173913043,
85
+ "grad_norm": 1.0375007390975952,
86
+ "learning_rate": 1.7173913043478263e-05,
87
+ "loss": 1.3207,
88
+ "mean_token_accuracy": 0.5182694345712662,
89
+ "num_tokens": 104210.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.4509355902671814,
94
+ "epoch": 0.09782608695652174,
95
+ "grad_norm": 0.866616427898407,
96
+ "learning_rate": 1.9347826086956523e-05,
97
+ "loss": 1.2442,
98
+ "mean_token_accuracy": 0.5508454263210296,
99
+ "num_tokens": 117342.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.4595998883247376,
104
+ "epoch": 0.10869565217391304,
105
+ "grad_norm": 0.9921526312828064,
106
+ "learning_rate": 2.1521739130434784e-05,
107
+ "loss": 1.2513,
108
+ "mean_token_accuracy": 0.5439675092697144,
109
+ "num_tokens": 130168.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.468259596824646,
114
+ "epoch": 0.11956521739130435,
115
+ "grad_norm": 0.8542688488960266,
116
+ "learning_rate": 2.3695652173913045e-05,
117
+ "loss": 1.2523,
118
+ "mean_token_accuracy": 0.5456153243780136,
119
+ "num_tokens": 143277.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.4652462244033813,
124
+ "epoch": 0.13043478260869565,
125
+ "grad_norm": 0.8958607316017151,
126
+ "learning_rate": 2.5869565217391305e-05,
127
+ "loss": 1.2564,
128
+ "mean_token_accuracy": 0.5374186933040619,
129
+ "num_tokens": 155929.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.4442671895027162,
134
+ "epoch": 0.14130434782608695,
135
+ "grad_norm": 1.0437828302383423,
136
+ "learning_rate": 2.8043478260869566e-05,
137
+ "loss": 1.2463,
138
+ "mean_token_accuracy": 0.5506911396980285,
139
+ "num_tokens": 168922.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.442794382572174,
144
+ "epoch": 0.15217391304347827,
145
+ "grad_norm": 1.1950273513793945,
146
+ "learning_rate": 3.0217391304347827e-05,
147
+ "loss": 1.2343,
148
+ "mean_token_accuracy": 0.561489287018776,
149
+ "num_tokens": 181883.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.4483441829681396,
154
+ "epoch": 0.16304347826086957,
155
+ "grad_norm": 1.27411687374115,
156
+ "learning_rate": 3.239130434782609e-05,
157
+ "loss": 1.2515,
158
+ "mean_token_accuracy": 0.5461658954620361,
159
+ "num_tokens": 194847.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.4531602740287781,
164
+ "epoch": 0.17391304347826086,
165
+ "grad_norm": 0.9844512343406677,
166
+ "learning_rate": 3.456521739130435e-05,
167
+ "loss": 1.2379,
168
+ "mean_token_accuracy": 0.5450588703155518,
169
+ "num_tokens": 207431.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.4460819005966186,
174
+ "epoch": 0.18478260869565216,
175
+ "grad_norm": 0.965182363986969,
176
+ "learning_rate": 3.673913043478261e-05,
177
+ "loss": 1.2497,
178
+ "mean_token_accuracy": 0.5442000389099121,
179
+ "num_tokens": 220382.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.4696384787559509,
184
+ "epoch": 0.1956521739130435,
185
+ "grad_norm": 0.8425037860870361,
186
+ "learning_rate": 3.8913043478260866e-05,
187
+ "loss": 1.2847,
188
+ "mean_token_accuracy": 0.5304420441389084,
189
+ "num_tokens": 232940.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.4491984724998475,
194
+ "epoch": 0.20652173913043478,
195
+ "grad_norm": 1.1692280769348145,
196
+ "learning_rate": 4.1086956521739134e-05,
197
+ "loss": 1.2342,
198
+ "mean_token_accuracy": 0.5570813834667205,
199
+ "num_tokens": 245747.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.466271436214447,
204
+ "epoch": 0.21739130434782608,
205
+ "grad_norm": 1.0157368183135986,
206
+ "learning_rate": 4.3260869565217394e-05,
207
+ "loss": 1.2499,
208
+ "mean_token_accuracy": 0.5432725459337234,
209
+ "num_tokens": 258696.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.4768565893173218,
214
+ "epoch": 0.22826086956521738,
215
+ "grad_norm": 1.109692096710205,
216
+ "learning_rate": 4.5434782608695655e-05,
217
+ "loss": 1.2343,
218
+ "mean_token_accuracy": 0.5567020237445831,
219
+ "num_tokens": 271378.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.4473167181015014,
224
+ "epoch": 0.2391304347826087,
225
+ "grad_norm": 0.850563108921051,
226
+ "learning_rate": 4.7608695652173916e-05,
227
+ "loss": 1.1959,
228
+ "mean_token_accuracy": 0.5724921762943268,
229
+ "num_tokens": 284704.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.4478083968162536,
234
+ "epoch": 0.25,
235
+ "grad_norm": 1.0289748907089233,
236
+ "learning_rate": 4.9782608695652176e-05,
237
+ "loss": 1.2392,
238
+ "mean_token_accuracy": 0.5519216269254684,
239
+ "num_tokens": 296961.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.4915278434753418,
244
+ "epoch": 0.2608695652173913,
245
+ "grad_norm": 1.3161778450012207,
246
+ "learning_rate": 5.195652173913044e-05,
247
+ "loss": 1.2539,
248
+ "mean_token_accuracy": 0.5443875581026077,
249
+ "num_tokens": 310082.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.4435262322425841,
254
+ "epoch": 0.2717391304347826,
255
+ "grad_norm": 1.2697113752365112,
256
+ "learning_rate": 5.41304347826087e-05,
257
+ "loss": 1.1911,
258
+ "mean_token_accuracy": 0.576522421836853,
259
+ "num_tokens": 323044.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.4607349276542663,
264
+ "epoch": 0.2826086956521739,
265
+ "grad_norm": 0.8006339073181152,
266
+ "learning_rate": 5.630434782608696e-05,
267
+ "loss": 1.2088,
268
+ "mean_token_accuracy": 0.5584357857704163,
269
+ "num_tokens": 336108.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.4630830883979797,
274
+ "epoch": 0.29347826086956524,
275
+ "grad_norm": 0.8462095856666565,
276
+ "learning_rate": 5.847826086956521e-05,
277
+ "loss": 1.2458,
278
+ "mean_token_accuracy": 0.5520103573799133,
279
+ "num_tokens": 349210.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.458599328994751,
284
+ "epoch": 0.30434782608695654,
285
+ "grad_norm": 0.930942177772522,
286
+ "learning_rate": 6.0652173913043487e-05,
287
+ "loss": 1.2219,
288
+ "mean_token_accuracy": 0.5603324949741364,
289
+ "num_tokens": 361465.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.4730467200279236,
294
+ "epoch": 0.31521739130434784,
295
+ "grad_norm": 0.9836443066596985,
296
+ "learning_rate": 6.282608695652175e-05,
297
+ "loss": 1.2493,
298
+ "mean_token_accuracy": 0.5466845005750656,
299
+ "num_tokens": 374931.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.4596371173858642,
304
+ "epoch": 0.32608695652173914,
305
+ "grad_norm": 0.9860939383506775,
306
+ "learning_rate": 6.500000000000001e-05,
307
+ "loss": 1.2174,
308
+ "mean_token_accuracy": 0.556469538807869,
309
+ "num_tokens": 387929.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.4784631490707398,
314
+ "epoch": 0.33695652173913043,
315
+ "grad_norm": 0.8261193037033081,
316
+ "learning_rate": 6.717391304347827e-05,
317
+ "loss": 1.2191,
318
+ "mean_token_accuracy": 0.5600455164909363,
319
+ "num_tokens": 401392.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.4627429723739624,
324
+ "epoch": 0.34782608695652173,
325
+ "grad_norm": 0.896903395652771,
326
+ "learning_rate": 6.934782608695653e-05,
327
+ "loss": 1.1987,
328
+ "mean_token_accuracy": 0.5688268154859543,
329
+ "num_tokens": 414466.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.481223452091217,
334
+ "epoch": 0.358695652173913,
335
+ "grad_norm": 0.9765130877494812,
336
+ "learning_rate": 7.152173913043479e-05,
337
+ "loss": 1.2161,
338
+ "mean_token_accuracy": 0.5661008894443512,
339
+ "num_tokens": 427231.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.5086342811584472,
344
+ "epoch": 0.3695652173913043,
345
+ "grad_norm": 0.8136937022209167,
346
+ "learning_rate": 7.369565217391304e-05,
347
+ "loss": 1.2884,
348
+ "mean_token_accuracy": 0.5307422339916229,
349
+ "num_tokens": 439856.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.4857903122901917,
354
+ "epoch": 0.3804347826086957,
355
+ "grad_norm": 0.913378894329071,
356
+ "learning_rate": 7.58695652173913e-05,
357
+ "loss": 1.2631,
358
+ "mean_token_accuracy": 0.5459983497858047,
359
+ "num_tokens": 452683.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.5006132960319518,
364
+ "epoch": 0.391304347826087,
365
+ "grad_norm": 1.0260237455368042,
366
+ "learning_rate": 7.804347826086957e-05,
367
+ "loss": 1.2587,
368
+ "mean_token_accuracy": 0.5429587304592133,
369
+ "num_tokens": 465274.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.4861261010169984,
374
+ "epoch": 0.40217391304347827,
375
+ "grad_norm": 1.04011869430542,
376
+ "learning_rate": 8.021739130434783e-05,
377
+ "loss": 1.2147,
378
+ "mean_token_accuracy": 0.5620492398738861,
379
+ "num_tokens": 478175.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.4943390011787414,
384
+ "epoch": 0.41304347826086957,
385
+ "grad_norm": 0.9155416488647461,
386
+ "learning_rate": 8.23913043478261e-05,
387
+ "loss": 1.2128,
388
+ "mean_token_accuracy": 0.5667012810707093,
389
+ "num_tokens": 491001.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.5116252064704896,
394
+ "epoch": 0.42391304347826086,
395
+ "grad_norm": 0.8238904476165771,
396
+ "learning_rate": 8.456521739130435e-05,
397
+ "loss": 1.2677,
398
+ "mean_token_accuracy": 0.5370148032903671,
399
+ "num_tokens": 503764.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.4961246132850647,
404
+ "epoch": 0.43478260869565216,
405
+ "grad_norm": 0.8830587863922119,
406
+ "learning_rate": 8.673913043478261e-05,
407
+ "loss": 1.1999,
408
+ "mean_token_accuracy": 0.5743164956569672,
409
+ "num_tokens": 516294.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.5065942287445069,
414
+ "epoch": 0.44565217391304346,
415
+ "grad_norm": 0.9117815494537354,
416
+ "learning_rate": 8.891304347826088e-05,
417
+ "loss": 1.2607,
418
+ "mean_token_accuracy": 0.550678727030754,
419
+ "num_tokens": 529384.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.5079344272613526,
424
+ "epoch": 0.45652173913043476,
425
+ "grad_norm": 0.8730387091636658,
426
+ "learning_rate": 9.108695652173914e-05,
427
+ "loss": 1.2087,
428
+ "mean_token_accuracy": 0.5660586059093475,
429
+ "num_tokens": 542010.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.5147196769714355,
434
+ "epoch": 0.4673913043478261,
435
+ "grad_norm": 0.7791972160339355,
436
+ "learning_rate": 9.32608695652174e-05,
437
+ "loss": 1.2471,
438
+ "mean_token_accuracy": 0.5513437986373901,
439
+ "num_tokens": 554428.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.5182705640792846,
444
+ "epoch": 0.4782608695652174,
445
+ "grad_norm": 0.7569729089736938,
446
+ "learning_rate": 9.543478260869566e-05,
447
+ "loss": 1.2876,
448
+ "mean_token_accuracy": 0.5325394898653031,
449
+ "num_tokens": 567462.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.5036618828773498,
454
+ "epoch": 0.4891304347826087,
455
+ "grad_norm": 0.7794932126998901,
456
+ "learning_rate": 9.760869565217392e-05,
457
+ "loss": 1.2539,
458
+ "mean_token_accuracy": 0.5439064025878906,
459
+ "num_tokens": 580377.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.4947790503501892,
464
+ "epoch": 0.5,
465
+ "grad_norm": 0.8008731007575989,
466
+ "learning_rate": 9.978260869565218e-05,
467
+ "loss": 1.2352,
468
+ "mean_token_accuracy": 0.5524563610553741,
469
+ "num_tokens": 593597.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.5091761827468873,
474
+ "epoch": 0.5108695652173914,
475
+ "grad_norm": 0.9790273904800415,
476
+ "learning_rate": 9.999973836157333e-05,
477
+ "loss": 1.2448,
478
+ "mean_token_accuracy": 0.5587224543094635,
479
+ "num_tokens": 606659.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.5102417111396789,
484
+ "epoch": 0.5217391304347826,
485
+ "grad_norm": 0.9725663065910339,
486
+ "learning_rate": 9.999883393595947e-05,
487
+ "loss": 1.2366,
488
+ "mean_token_accuracy": 0.555170550942421,
489
+ "num_tokens": 619406.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.5230854153633118,
494
+ "epoch": 0.532608695652174,
495
+ "grad_norm": 1.0150320529937744,
496
+ "learning_rate": 9.999728350473721e-05,
497
+ "loss": 1.2304,
498
+ "mean_token_accuracy": 0.5601270943880081,
499
+ "num_tokens": 632194.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.5028501629829407,
504
+ "epoch": 0.5434782608695652,
505
+ "grad_norm": 0.8656931519508362,
506
+ "learning_rate": 9.99950870879387e-05,
507
+ "loss": 1.2286,
508
+ "mean_token_accuracy": 0.5571267485618592,
509
+ "num_tokens": 645327.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.5302343845367432,
514
+ "epoch": 0.5543478260869565,
515
+ "grad_norm": 0.7537740468978882,
516
+ "learning_rate": 9.99922447139426e-05,
517
+ "loss": 1.2342,
518
+ "mean_token_accuracy": 0.5612987399101257,
519
+ "num_tokens": 658378.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.5025633692741394,
524
+ "epoch": 0.5652173913043478,
525
+ "grad_norm": 0.6478719115257263,
526
+ "learning_rate": 9.998875641947354e-05,
527
+ "loss": 1.2429,
528
+ "mean_token_accuracy": 0.5501718163490296,
529
+ "num_tokens": 671323.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.5004254579544067,
534
+ "epoch": 0.5760869565217391,
535
+ "grad_norm": 1.2102432250976562,
536
+ "learning_rate": 9.998462224960175e-05,
537
+ "loss": 1.213,
538
+ "mean_token_accuracy": 0.5621294498443603,
539
+ "num_tokens": 683878.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.5195318818092347,
544
+ "epoch": 0.5869565217391305,
545
+ "grad_norm": 0.7961319088935852,
546
+ "learning_rate": 9.997984225774238e-05,
547
+ "loss": 1.2492,
548
+ "mean_token_accuracy": 0.5559745967388153,
549
+ "num_tokens": 696935.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.5280181407928466,
554
+ "epoch": 0.5978260869565217,
555
+ "grad_norm": 0.8740176558494568,
556
+ "learning_rate": 9.99744165056549e-05,
557
+ "loss": 1.2197,
558
+ "mean_token_accuracy": 0.5634395360946656,
559
+ "num_tokens": 710020.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.5327624678611755,
564
+ "epoch": 0.6086956521739131,
565
+ "grad_norm": 0.8462045192718506,
566
+ "learning_rate": 9.99683450634423e-05,
567
+ "loss": 1.2192,
568
+ "mean_token_accuracy": 0.5612434148788452,
569
+ "num_tokens": 723303.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.5094027161598205,
574
+ "epoch": 0.6195652173913043,
575
+ "grad_norm": 0.9465392231941223,
576
+ "learning_rate": 9.996162800955011e-05,
577
+ "loss": 1.1817,
578
+ "mean_token_accuracy": 0.5782815992832184,
579
+ "num_tokens": 735527.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.5501951813697814,
584
+ "epoch": 0.6304347826086957,
585
+ "grad_norm": 0.7445736527442932,
586
+ "learning_rate": 9.995426543076545e-05,
587
+ "loss": 1.2452,
588
+ "mean_token_accuracy": 0.5505437403917313,
589
+ "num_tokens": 748455.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.5227739214897156,
594
+ "epoch": 0.6413043478260869,
595
+ "grad_norm": 0.8378339409828186,
596
+ "learning_rate": 9.994625742221586e-05,
597
+ "loss": 1.2551,
598
+ "mean_token_accuracy": 0.5548771649599076,
599
+ "num_tokens": 761420.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.5428736090660096,
604
+ "epoch": 0.6521739130434783,
605
+ "grad_norm": 0.9249877333641052,
606
+ "learning_rate": 9.993760408736814e-05,
607
+ "loss": 1.282,
608
+ "mean_token_accuracy": 0.5393997848033905,
609
+ "num_tokens": 773676.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.5630847930908203,
614
+ "epoch": 0.6630434782608695,
615
+ "grad_norm": 0.8152625560760498,
616
+ "learning_rate": 9.992830553802696e-05,
617
+ "loss": 1.2763,
618
+ "mean_token_accuracy": 0.5402287989854813,
619
+ "num_tokens": 786757.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.532595145702362,
624
+ "epoch": 0.6739130434782609,
625
+ "grad_norm": 0.7313966751098633,
626
+ "learning_rate": 9.991836189433342e-05,
627
+ "loss": 1.2323,
628
+ "mean_token_accuracy": 0.5645015567541123,
629
+ "num_tokens": 799851.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.5160420179367065,
634
+ "epoch": 0.6847826086956522,
635
+ "grad_norm": 0.7158486843109131,
636
+ "learning_rate": 9.990777328476348e-05,
637
+ "loss": 1.2021,
638
+ "mean_token_accuracy": 0.555733984708786,
639
+ "num_tokens": 812648.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.5084859609603882,
644
+ "epoch": 0.6956521739130435,
645
+ "grad_norm": 0.7056333422660828,
646
+ "learning_rate": 9.98965398461264e-05,
647
+ "loss": 1.176,
648
+ "mean_token_accuracy": 0.5772889316082,
649
+ "num_tokens": 825054.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.5172175765037537,
654
+ "epoch": 0.7065217391304348,
655
+ "grad_norm": 0.8173061013221741,
656
+ "learning_rate": 9.988466172356282e-05,
657
+ "loss": 1.1893,
658
+ "mean_token_accuracy": 0.5774871349334717,
659
+ "num_tokens": 838148.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.5155822157859802,
664
+ "epoch": 0.717391304347826,
665
+ "grad_norm": 0.7483378648757935,
666
+ "learning_rate": 9.9872139070543e-05,
667
+ "loss": 1.2377,
668
+ "mean_token_accuracy": 0.5529649972915649,
669
+ "num_tokens": 851079.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.5392202258110046,
674
+ "epoch": 0.7282608695652174,
675
+ "grad_norm": 0.8020080924034119,
676
+ "learning_rate": 9.985897204886481e-05,
677
+ "loss": 1.2471,
678
+ "mean_token_accuracy": 0.5591055184602738,
679
+ "num_tokens": 863673.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.520468044281006,
684
+ "epoch": 0.7391304347826086,
685
+ "grad_norm": 0.7957432866096497,
686
+ "learning_rate": 9.984516082865159e-05,
687
+ "loss": 1.2582,
688
+ "mean_token_accuracy": 0.5404952645301819,
689
+ "num_tokens": 876764.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.5024335741996766,
694
+ "epoch": 0.75,
695
+ "grad_norm": 0.8745436072349548,
696
+ "learning_rate": 9.983070558835002e-05,
697
+ "loss": 1.2029,
698
+ "mean_token_accuracy": 0.5673643052577972,
699
+ "num_tokens": 889851.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.5227225780487061,
704
+ "epoch": 0.7608695652173914,
705
+ "grad_norm": 0.7866286039352417,
706
+ "learning_rate": 9.981560651472781e-05,
707
+ "loss": 1.2597,
708
+ "mean_token_accuracy": 0.5447615504264831,
709
+ "num_tokens": 903182.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.5330517888069153,
714
+ "epoch": 0.7717391304347826,
715
+ "grad_norm": 0.696030855178833,
716
+ "learning_rate": 9.97998638028712e-05,
717
+ "loss": 1.2417,
718
+ "mean_token_accuracy": 0.5569504171609878,
719
+ "num_tokens": 916564.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.5023605585098267,
724
+ "epoch": 0.782608695652174,
725
+ "grad_norm": 0.7980480194091797,
726
+ "learning_rate": 9.978347765618257e-05,
727
+ "loss": 1.2073,
728
+ "mean_token_accuracy": 0.562690931558609,
729
+ "num_tokens": 929820.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.5466702461242676,
734
+ "epoch": 0.7934782608695652,
735
+ "grad_norm": 0.8441147804260254,
736
+ "learning_rate": 9.976644828637767e-05,
737
+ "loss": 1.2859,
738
+ "mean_token_accuracy": 0.5330282121896743,
739
+ "num_tokens": 942449.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.515641415119171,
744
+ "epoch": 0.8043478260869565,
745
+ "grad_norm": 0.8833957314491272,
746
+ "learning_rate": 9.974877591348304e-05,
747
+ "loss": 1.2627,
748
+ "mean_token_accuracy": 0.5418030679225921,
749
+ "num_tokens": 955620.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 1.5292868852615356,
754
+ "epoch": 0.8152173913043478,
755
+ "grad_norm": 0.8666150569915771,
756
+ "learning_rate": 9.973046076583301e-05,
757
+ "loss": 1.2364,
758
+ "mean_token_accuracy": 0.5494832009077072,
759
+ "num_tokens": 968954.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.5135694026947022,
764
+ "epoch": 0.8260869565217391,
765
+ "grad_norm": 0.9172241687774658,
766
+ "learning_rate": 9.97115030800669e-05,
767
+ "loss": 1.2053,
768
+ "mean_token_accuracy": 0.5607668071985245,
769
+ "num_tokens": 981323.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.5214222908020019,
774
+ "epoch": 0.8369565217391305,
775
+ "grad_norm": 0.9353718161582947,
776
+ "learning_rate": 9.969190310112579e-05,
777
+ "loss": 1.225,
778
+ "mean_token_accuracy": 0.5599299073219299,
779
+ "num_tokens": 994834.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.5419185280799865,
784
+ "epoch": 0.8478260869565217,
785
+ "grad_norm": 0.717232882976532,
786
+ "learning_rate": 9.967166108224957e-05,
787
+ "loss": 1.2848,
788
+ "mean_token_accuracy": 0.5360999226570129,
789
+ "num_tokens": 1007806.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.545407807826996,
794
+ "epoch": 0.8586956521739131,
795
+ "grad_norm": 0.745928943157196,
796
+ "learning_rate": 9.965077728497348e-05,
797
+ "loss": 1.2683,
798
+ "mean_token_accuracy": 0.5427737534046173,
799
+ "num_tokens": 1021093.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.5416621446609498,
804
+ "epoch": 0.8695652173913043,
805
+ "grad_norm": 0.8545331954956055,
806
+ "learning_rate": 9.96292519791248e-05,
807
+ "loss": 1.3036,
808
+ "mean_token_accuracy": 0.5352708637714386,
809
+ "num_tokens": 1034317.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.516196882724762,
814
+ "epoch": 0.8804347826086957,
815
+ "grad_norm": 0.8239868879318237,
816
+ "learning_rate": 9.96070854428194e-05,
817
+ "loss": 1.1943,
818
+ "mean_token_accuracy": 0.568702632188797,
819
+ "num_tokens": 1047679.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.5478002548217773,
824
+ "epoch": 0.8913043478260869,
825
+ "grad_norm": 0.9187906980514526,
826
+ "learning_rate": 9.958427796245808e-05,
827
+ "loss": 1.2707,
828
+ "mean_token_accuracy": 0.5460701882839203,
829
+ "num_tokens": 1060840.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.540980589389801,
834
+ "epoch": 0.9021739130434783,
835
+ "grad_norm": 0.774869978427887,
836
+ "learning_rate": 9.956082983272293e-05,
837
+ "loss": 1.2397,
838
+ "mean_token_accuracy": 0.5464379012584686,
839
+ "num_tokens": 1073529.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.5131322622299195,
844
+ "epoch": 0.9130434782608695,
845
+ "grad_norm": 1.029721975326538,
846
+ "learning_rate": 9.953674135657345e-05,
847
+ "loss": 1.2198,
848
+ "mean_token_accuracy": 0.5641603857278824,
849
+ "num_tokens": 1086600.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.5259755611419679,
854
+ "epoch": 0.9239130434782609,
855
+ "grad_norm": 0.8057295083999634,
856
+ "learning_rate": 9.951201284524275e-05,
857
+ "loss": 1.2492,
858
+ "mean_token_accuracy": 0.5562368750572204,
859
+ "num_tokens": 1099737.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.5159748077392579,
864
+ "epoch": 0.9347826086956522,
865
+ "grad_norm": 0.6001420617103577,
866
+ "learning_rate": 9.94866446182334e-05,
867
+ "loss": 1.2524,
868
+ "mean_token_accuracy": 0.5458084315061569,
869
+ "num_tokens": 1112239.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 1.524741494655609,
874
+ "epoch": 0.9456521739130435,
875
+ "grad_norm": 0.847523033618927,
876
+ "learning_rate": 9.94606370033134e-05,
877
+ "loss": 1.2245,
878
+ "mean_token_accuracy": 0.5601188719272614,
879
+ "num_tokens": 1125191.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.5371912598609925,
884
+ "epoch": 0.9565217391304348,
885
+ "grad_norm": 0.767745852470398,
886
+ "learning_rate": 9.943399033651189e-05,
887
+ "loss": 1.2319,
888
+ "mean_token_accuracy": 0.5546965420246124,
889
+ "num_tokens": 1138077.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.5223184943199157,
894
+ "epoch": 0.967391304347826,
895
+ "grad_norm": 0.9313151836395264,
896
+ "learning_rate": 9.94067049621148e-05,
897
+ "loss": 1.2237,
898
+ "mean_token_accuracy": 0.5578917026519775,
899
+ "num_tokens": 1151364.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.530699372291565,
904
+ "epoch": 0.9782608695652174,
905
+ "grad_norm": 0.7053420543670654,
906
+ "learning_rate": 9.937878123266044e-05,
907
+ "loss": 1.2269,
908
+ "mean_token_accuracy": 0.5488695651292801,
909
+ "num_tokens": 1164326.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.5197353124618531,
914
+ "epoch": 0.9891304347826086,
915
+ "grad_norm": 0.9986150860786438,
916
+ "learning_rate": 9.9350219508935e-05,
917
+ "loss": 1.2106,
918
+ "mean_token_accuracy": 0.5582584798336029,
919
+ "num_tokens": 1176914.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.5376808762550354,
924
+ "epoch": 1.0,
925
+ "grad_norm": 0.7129160165786743,
926
+ "learning_rate": 9.93210201599677e-05,
927
+ "loss": 1.2377,
928
+ "mean_token_accuracy": 0.557282817363739,
929
+ "num_tokens": 1189994.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.5522505402565003,
934
+ "epoch": 1.0108695652173914,
935
+ "grad_norm": 0.9139987230300903,
936
+ "learning_rate": 9.929118356302621e-05,
937
+ "loss": 1.2492,
938
+ "mean_token_accuracy": 0.5444983661174774,
939
+ "num_tokens": 1202961.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.5519829273223877,
944
+ "epoch": 1.0217391304347827,
945
+ "grad_norm": 1.0422664880752563,
946
+ "learning_rate": 9.926071010361173e-05,
947
+ "loss": 1.1957,
948
+ "mean_token_accuracy": 0.5779279708862305,
949
+ "num_tokens": 1215901.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.5434082865715026,
954
+ "epoch": 1.0326086956521738,
955
+ "grad_norm": 1.0472567081451416,
956
+ "learning_rate": 9.922960017545395e-05,
957
+ "loss": 1.2263,
958
+ "mean_token_accuracy": 0.5640866041183472,
959
+ "num_tokens": 1228567.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 1.5352994203567505,
964
+ "epoch": 1.0434782608695652,
965
+ "grad_norm": 1.0810585021972656,
966
+ "learning_rate": 9.919785418050598e-05,
967
+ "loss": 1.1876,
968
+ "mean_token_accuracy": 0.5709751307964325,
969
+ "num_tokens": 1241529.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 1.4996863842010497,
974
+ "epoch": 1.0543478260869565,
975
+ "grad_norm": 1.1204661130905151,
976
+ "learning_rate": 9.916547252893923e-05,
977
+ "loss": 1.1354,
978
+ "mean_token_accuracy": 0.5961336076259613,
979
+ "num_tokens": 1254137.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 1.5315279841423035,
984
+ "epoch": 1.065217391304348,
985
+ "grad_norm": 1.0741767883300781,
986
+ "learning_rate": 9.9132455639138e-05,
987
+ "loss": 1.1422,
988
+ "mean_token_accuracy": 0.5875493228435517,
989
+ "num_tokens": 1266871.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 1.516059410572052,
994
+ "epoch": 1.0760869565217392,
995
+ "grad_norm": 1.1965429782867432,
996
+ "learning_rate": 9.90988039376942e-05,
997
+ "loss": 1.1438,
998
+ "mean_token_accuracy": 0.5906685352325439,
999
+ "num_tokens": 1279655.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 1.5148675918579102,
1004
+ "epoch": 1.0869565217391304,
1005
+ "grad_norm": 1.1992353200912476,
1006
+ "learning_rate": 9.906451785940167e-05,
1007
+ "loss": 1.1636,
1008
+ "mean_token_accuracy": 0.5710582077503205,
1009
+ "num_tokens": 1292202.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 1.5187682271003724,
1014
+ "epoch": 1.0978260869565217,
1015
+ "grad_norm": 1.0606764554977417,
1016
+ "learning_rate": 9.902959784725077e-05,
1017
+ "loss": 1.1763,
1018
+ "mean_token_accuracy": 0.5760969400405884,
1019
+ "num_tokens": 1305284.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 1.5265469312667848,
1024
+ "epoch": 1.108695652173913,
1025
+ "grad_norm": 1.02944815158844,
1026
+ "learning_rate": 9.899404435242246e-05,
1027
+ "loss": 1.2096,
1028
+ "mean_token_accuracy": 0.5624277234077454,
1029
+ "num_tokens": 1318408.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 1.5455774545669556,
1034
+ "epoch": 1.1195652173913044,
1035
+ "grad_norm": 1.1493759155273438,
1036
+ "learning_rate": 9.895785783428262e-05,
1037
+ "loss": 1.1652,
1038
+ "mean_token_accuracy": 0.5867336988449097,
1039
+ "num_tokens": 1331156.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 1.5371973156929015,
1044
+ "epoch": 1.1304347826086956,
1045
+ "grad_norm": 0.9468239545822144,
1046
+ "learning_rate": 9.8921038760376e-05,
1047
+ "loss": 1.2371,
1048
+ "mean_token_accuracy": 0.5544474184513092,
1049
+ "num_tokens": 1343904.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 1.5403631925582886,
1054
+ "epoch": 1.141304347826087,
1055
+ "grad_norm": 1.1717609167099,
1056
+ "learning_rate": 9.888358760642029e-05,
1057
+ "loss": 1.1394,
1058
+ "mean_token_accuracy": 0.5933512449264526,
1059
+ "num_tokens": 1356797.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 1.5518387794494628,
1064
+ "epoch": 1.1521739130434783,
1065
+ "grad_norm": 1.2024801969528198,
1066
+ "learning_rate": 9.884550485629987e-05,
1067
+ "loss": 1.2065,
1068
+ "mean_token_accuracy": 0.5667118012905121,
1069
+ "num_tokens": 1369690.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 1.5736596703529357,
1074
+ "epoch": 1.1630434782608696,
1075
+ "grad_norm": 1.0323596000671387,
1076
+ "learning_rate": 9.88067910020596e-05,
1077
+ "loss": 1.2124,
1078
+ "mean_token_accuracy": 0.5691272497177124,
1079
+ "num_tokens": 1382561.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 1.57814359664917,
1084
+ "epoch": 1.1739130434782608,
1085
+ "grad_norm": 1.1128944158554077,
1086
+ "learning_rate": 9.876744654389854e-05,
1087
+ "loss": 1.2319,
1088
+ "mean_token_accuracy": 0.554848113656044,
1089
+ "num_tokens": 1395409.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 1.5651036262512208,
1094
+ "epoch": 1.184782608695652,
1095
+ "grad_norm": 1.1131497621536255,
1096
+ "learning_rate": 9.872747199016328e-05,
1097
+ "loss": 1.1995,
1098
+ "mean_token_accuracy": 0.5680587291717529,
1099
+ "num_tokens": 1408511.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 1.519801914691925,
1104
+ "epoch": 1.1956521739130435,
1105
+ "grad_norm": 0.8381641507148743,
1106
+ "learning_rate": 9.868686785734165e-05,
1107
+ "loss": 1.1729,
1108
+ "mean_token_accuracy": 0.5780038118362427,
1109
+ "num_tokens": 1421328.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 1.5411308765411378,
1114
+ "epoch": 1.2065217391304348,
1115
+ "grad_norm": 1.1784008741378784,
1116
+ "learning_rate": 9.86456346700558e-05,
1117
+ "loss": 1.2026,
1118
+ "mean_token_accuracy": 0.5581619143486023,
1119
+ "num_tokens": 1434644.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 1.524932038784027,
1124
+ "epoch": 1.2173913043478262,
1125
+ "grad_norm": 0.9289618730545044,
1126
+ "learning_rate": 9.860377296105556e-05,
1127
+ "loss": 1.219,
1128
+ "mean_token_accuracy": 0.557993471622467,
1129
+ "num_tokens": 1447469.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 1.5029574513435364,
1134
+ "epoch": 1.2282608695652173,
1135
+ "grad_norm": 1.0168135166168213,
1136
+ "learning_rate": 9.856128327121155e-05,
1137
+ "loss": 1.1589,
1138
+ "mean_token_accuracy": 0.578672569990158,
1139
+ "num_tokens": 1460202.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 1.5095925211906434,
1144
+ "epoch": 1.2391304347826086,
1145
+ "grad_norm": 1.052454948425293,
1146
+ "learning_rate": 9.85181661495081e-05,
1147
+ "loss": 1.2232,
1148
+ "mean_token_accuracy": 0.5522898703813552,
1149
+ "num_tokens": 1473114.0,
1150
+ "step": 1140
1151
+ },
1152
+ {
1153
+ "entropy": 1.5059074401855468,
1154
+ "epoch": 1.25,
1155
+ "grad_norm": 1.20883309841156,
1156
+ "learning_rate": 9.847442215303626e-05,
1157
+ "loss": 1.2172,
1158
+ "mean_token_accuracy": 0.5659465253353119,
1159
+ "num_tokens": 1485990.0,
1160
+ "step": 1150
1161
+ },
1162
+ {
1163
+ "entropy": 1.494919514656067,
1164
+ "epoch": 1.2608695652173914,
1165
+ "grad_norm": 1.1653634309768677,
1166
+ "learning_rate": 9.843005184698655e-05,
1167
+ "loss": 1.1817,
1168
+ "mean_token_accuracy": 0.5764101088047028,
1169
+ "num_tokens": 1498939.0,
1170
+ "step": 1160
1171
+ },
1172
+ {
1173
+ "entropy": 1.5184181690216065,
1174
+ "epoch": 1.2717391304347827,
1175
+ "grad_norm": 1.1174242496490479,
1176
+ "learning_rate": 9.838505580464168e-05,
1177
+ "loss": 1.1976,
1178
+ "mean_token_accuracy": 0.5707351744174958,
1179
+ "num_tokens": 1511943.0,
1180
+ "step": 1170
1181
+ },
1182
+ {
1183
+ "entropy": 1.5217233657836915,
1184
+ "epoch": 1.2826086956521738,
1185
+ "grad_norm": 1.0029795169830322,
1186
+ "learning_rate": 9.833943460736912e-05,
1187
+ "loss": 1.2296,
1188
+ "mean_token_accuracy": 0.5572409898042678,
1189
+ "num_tokens": 1525135.0,
1190
+ "step": 1180
1191
+ },
1192
+ {
1193
+ "entropy": 1.514461922645569,
1194
+ "epoch": 1.2934782608695652,
1195
+ "grad_norm": 1.2473056316375732,
1196
+ "learning_rate": 9.829318884461359e-05,
1197
+ "loss": 1.221,
1198
+ "mean_token_accuracy": 0.5566778779029846,
1199
+ "num_tokens": 1537699.0,
1200
+ "step": 1190
1201
+ },
1202
+ {
1203
+ "entropy": 1.5298507332801818,
1204
+ "epoch": 1.3043478260869565,
1205
+ "grad_norm": 1.068049430847168,
1206
+ "learning_rate": 9.824631911388948e-05,
1207
+ "loss": 1.248,
1208
+ "mean_token_accuracy": 0.5430671572685242,
1209
+ "num_tokens": 1550938.0,
1210
+ "step": 1200
1211
+ },
1212
+ {
1213
+ "entropy": 1.5463980197906495,
1214
+ "epoch": 1.315217391304348,
1215
+ "grad_norm": 1.0760388374328613,
1216
+ "learning_rate": 9.819882602077309e-05,
1217
+ "loss": 1.2825,
1218
+ "mean_token_accuracy": 0.5330462247133255,
1219
+ "num_tokens": 1563597.0,
1220
+ "step": 1210
1221
+ },
1222
+ {
1223
+ "entropy": 1.5457575082778932,
1224
+ "epoch": 1.3260869565217392,
1225
+ "grad_norm": 1.1161272525787354,
1226
+ "learning_rate": 9.815071017889482e-05,
1227
+ "loss": 1.2598,
1228
+ "mean_token_accuracy": 0.543943053483963,
1229
+ "num_tokens": 1576301.0,
1230
+ "step": 1220
1231
+ },
1232
+ {
1233
+ "entropy": 1.5349620819091796,
1234
+ "epoch": 1.3369565217391304,
1235
+ "grad_norm": 1.1779778003692627,
1236
+ "learning_rate": 9.810197220993123e-05,
1237
+ "loss": 1.2551,
1238
+ "mean_token_accuracy": 0.5386941403150558,
1239
+ "num_tokens": 1589776.0,
1240
+ "step": 1230
1241
+ },
1242
+ {
1243
+ "entropy": 1.5158817052841187,
1244
+ "epoch": 1.3478260869565217,
1245
+ "grad_norm": 1.1150175333023071,
1246
+ "learning_rate": 9.805261274359705e-05,
1247
+ "loss": 1.193,
1248
+ "mean_token_accuracy": 0.5642519950866699,
1249
+ "num_tokens": 1602239.0,
1250
+ "step": 1240
1251
+ },
1252
+ {
1253
+ "entropy": 1.512274718284607,
1254
+ "epoch": 1.358695652173913,
1255
+ "grad_norm": 0.9392043948173523,
1256
+ "learning_rate": 9.800263241763698e-05,
1257
+ "loss": 1.2334,
1258
+ "mean_token_accuracy": 0.5577278465032578,
1259
+ "num_tokens": 1615621.0,
1260
+ "step": 1250
1261
+ },
1262
+ {
1263
+ "entropy": 1.5087523460388184,
1264
+ "epoch": 1.3695652173913042,
1265
+ "grad_norm": 0.9521236419677734,
1266
+ "learning_rate": 9.795203187781751e-05,
1267
+ "loss": 1.1651,
1268
+ "mean_token_accuracy": 0.5836262464523315,
1269
+ "num_tokens": 1628741.0,
1270
+ "step": 1260
1271
+ },
1272
+ {
1273
+ "entropy": 1.5212602257728576,
1274
+ "epoch": 1.3804347826086958,
1275
+ "grad_norm": 0.9689566493034363,
1276
+ "learning_rate": 9.790081177791852e-05,
1277
+ "loss": 1.1944,
1278
+ "mean_token_accuracy": 0.572248637676239,
1279
+ "num_tokens": 1641646.0,
1280
+ "step": 1270
1281
+ },
1282
+ {
1283
+ "entropy": 1.521955931186676,
1284
+ "epoch": 1.391304347826087,
1285
+ "grad_norm": 1.016711711883545,
1286
+ "learning_rate": 9.784897277972491e-05,
1287
+ "loss": 1.2105,
1288
+ "mean_token_accuracy": 0.5605559885501862,
1289
+ "num_tokens": 1654499.0,
1290
+ "step": 1280
1291
+ },
1292
+ {
1293
+ "entropy": 1.5126453638076782,
1294
+ "epoch": 1.4021739130434783,
1295
+ "grad_norm": 1.1951313018798828,
1296
+ "learning_rate": 9.779651555301794e-05,
1297
+ "loss": 1.2305,
1298
+ "mean_token_accuracy": 0.5537042915821075,
1299
+ "num_tokens": 1667748.0,
1300
+ "step": 1290
1301
+ },
1302
+ {
1303
+ "entropy": 1.528828752040863,
1304
+ "epoch": 1.4130434782608696,
1305
+ "grad_norm": 1.1385231018066406,
1306
+ "learning_rate": 9.77434407755667e-05,
1307
+ "loss": 1.2294,
1308
+ "mean_token_accuracy": 0.554050150513649,
1309
+ "num_tokens": 1681184.0,
1310
+ "step": 1300
1311
+ },
1312
+ {
1313
+ "entropy": 1.510583758354187,
1314
+ "epoch": 1.4239130434782608,
1315
+ "grad_norm": 1.0576328039169312,
1316
+ "learning_rate": 9.768974913311922e-05,
1317
+ "loss": 1.2674,
1318
+ "mean_token_accuracy": 0.5414516568183899,
1319
+ "num_tokens": 1693818.0,
1320
+ "step": 1310
1321
+ },
1322
+ {
1323
+ "entropy": 1.5150775551795959,
1324
+ "epoch": 1.434782608695652,
1325
+ "grad_norm": 1.3364728689193726,
1326
+ "learning_rate": 9.763544131939374e-05,
1327
+ "loss": 1.2075,
1328
+ "mean_token_accuracy": 0.559964632987976,
1329
+ "num_tokens": 1706939.0,
1330
+ "step": 1320
1331
+ },
1332
+ {
1333
+ "entropy": 1.5151704668998718,
1334
+ "epoch": 1.4456521739130435,
1335
+ "grad_norm": 1.02871835231781,
1336
+ "learning_rate": 9.758051803606971e-05,
1337
+ "loss": 1.2487,
1338
+ "mean_token_accuracy": 0.552227908372879,
1339
+ "num_tokens": 1719315.0,
1340
+ "step": 1330
1341
+ },
1342
+ {
1343
+ "entropy": 1.5152636528015138,
1344
+ "epoch": 1.4565217391304348,
1345
+ "grad_norm": 1.0097824335098267,
1346
+ "learning_rate": 9.75249799927786e-05,
1347
+ "loss": 1.2263,
1348
+ "mean_token_accuracy": 0.5533849179744721,
1349
+ "num_tokens": 1731891.0,
1350
+ "step": 1340
1351
+ },
1352
+ {
1353
+ "entropy": 1.512537384033203,
1354
+ "epoch": 1.4673913043478262,
1355
+ "grad_norm": 1.2632033824920654,
1356
+ "learning_rate": 9.746882790709491e-05,
1357
+ "loss": 1.222,
1358
+ "mean_token_accuracy": 0.5614925265312195,
1359
+ "num_tokens": 1744427.0,
1360
+ "step": 1350
1361
+ },
1362
+ {
1363
+ "entropy": 1.5295302748680115,
1364
+ "epoch": 1.4782608695652173,
1365
+ "grad_norm": 1.113368034362793,
1366
+ "learning_rate": 9.741206250452683e-05,
1367
+ "loss": 1.2735,
1368
+ "mean_token_accuracy": 0.539223712682724,
1369
+ "num_tokens": 1757083.0,
1370
+ "step": 1360
1371
+ },
1372
+ {
1373
+ "entropy": 1.536200964450836,
1374
+ "epoch": 1.4891304347826086,
1375
+ "grad_norm": 1.1522810459136963,
1376
+ "learning_rate": 9.735468451850681e-05,
1377
+ "loss": 1.2152,
1378
+ "mean_token_accuracy": 0.565186282992363,
1379
+ "num_tokens": 1769982.0,
1380
+ "step": 1370
1381
+ },
1382
+ {
1383
+ "entropy": 1.495800745487213,
1384
+ "epoch": 1.5,
1385
+ "grad_norm": 1.2632598876953125,
1386
+ "learning_rate": 9.729669469038216e-05,
1387
+ "loss": 1.1635,
1388
+ "mean_token_accuracy": 0.5871178984642029,
1389
+ "num_tokens": 1783102.0,
1390
+ "step": 1380
1391
+ },
1392
+ {
1393
+ "entropy": 1.535517191886902,
1394
+ "epoch": 1.5108695652173914,
1395
+ "grad_norm": 0.9593290090560913,
1396
+ "learning_rate": 9.723809376940544e-05,
1397
+ "loss": 1.2108,
1398
+ "mean_token_accuracy": 0.5709479689598084,
1399
+ "num_tokens": 1796398.0,
1400
+ "step": 1390
1401
+ },
1402
+ {
1403
+ "entropy": 1.529611337184906,
1404
+ "epoch": 1.5217391304347827,
1405
+ "grad_norm": 1.0819748640060425,
1406
+ "learning_rate": 9.717888251272477e-05,
1407
+ "loss": 1.1972,
1408
+ "mean_token_accuracy": 0.5633429378271103,
1409
+ "num_tokens": 1809379.0,
1410
+ "step": 1400
1411
+ },
1412
+ {
1413
+ "entropy": 1.5493282318115233,
1414
+ "epoch": 1.5326086956521738,
1415
+ "grad_norm": 0.9472999572753906,
1416
+ "learning_rate": 9.71190616853741e-05,
1417
+ "loss": 1.2616,
1418
+ "mean_token_accuracy": 0.5486618399620056,
1419
+ "num_tokens": 1822664.0,
1420
+ "step": 1410
1421
+ },
1422
+ {
1423
+ "entropy": 1.4989375710487365,
1424
+ "epoch": 1.5434782608695652,
1425
+ "grad_norm": 1.2883214950561523,
1426
+ "learning_rate": 9.705863206026321e-05,
1427
+ "loss": 1.2137,
1428
+ "mean_token_accuracy": 0.558601850271225,
1429
+ "num_tokens": 1835336.0,
1430
+ "step": 1420
1431
+ },
1432
+ {
1433
+ "entropy": 1.5061516761779785,
1434
+ "epoch": 1.5543478260869565,
1435
+ "grad_norm": 0.9577755928039551,
1436
+ "learning_rate": 9.699759441816787e-05,
1437
+ "loss": 1.1739,
1438
+ "mean_token_accuracy": 0.577557110786438,
1439
+ "num_tokens": 1847755.0,
1440
+ "step": 1430
1441
+ },
1442
+ {
1443
+ "entropy": 1.5141437649726868,
1444
+ "epoch": 1.5652173913043477,
1445
+ "grad_norm": 1.0751005411148071,
1446
+ "learning_rate": 9.693594954771965e-05,
1447
+ "loss": 1.231,
1448
+ "mean_token_accuracy": 0.5506497710943222,
1449
+ "num_tokens": 1860302.0,
1450
+ "step": 1440
1451
+ },
1452
+ {
1453
+ "entropy": 1.5419356107711792,
1454
+ "epoch": 1.5760869565217392,
1455
+ "grad_norm": 1.0141667127609253,
1456
+ "learning_rate": 9.687369824539577e-05,
1457
+ "loss": 1.2788,
1458
+ "mean_token_accuracy": 0.5303231775760651,
1459
+ "num_tokens": 1873093.0,
1460
+ "step": 1450
1461
+ },
1462
+ {
1463
+ "entropy": 1.520876133441925,
1464
+ "epoch": 1.5869565217391304,
1465
+ "grad_norm": 1.109215259552002,
1466
+ "learning_rate": 9.68108413155088e-05,
1467
+ "loss": 1.2333,
1468
+ "mean_token_accuracy": 0.5601014912128448,
1469
+ "num_tokens": 1886177.0,
1470
+ "step": 1460
1471
+ },
1472
+ {
1473
+ "entropy": 1.4981224894523621,
1474
+ "epoch": 1.5978260869565217,
1475
+ "grad_norm": 0.9200493097305298,
1476
+ "learning_rate": 9.674737957019624e-05,
1477
+ "loss": 1.1852,
1478
+ "mean_token_accuracy": 0.5700576066970825,
1479
+ "num_tokens": 1899113.0,
1480
+ "step": 1470
1481
+ },
1482
+ {
1483
+ "entropy": 1.5140800833702088,
1484
+ "epoch": 1.608695652173913,
1485
+ "grad_norm": 1.190007209777832,
1486
+ "learning_rate": 9.66833138294101e-05,
1487
+ "loss": 1.1929,
1488
+ "mean_token_accuracy": 0.5691904962062836,
1489
+ "num_tokens": 1912474.0,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "entropy": 1.5299779295921325,
1494
+ "epoch": 1.6195652173913042,
1495
+ "grad_norm": 0.9787003397941589,
1496
+ "learning_rate": 9.661864492090625e-05,
1497
+ "loss": 1.2179,
1498
+ "mean_token_accuracy": 0.553766930103302,
1499
+ "num_tokens": 1925685.0,
1500
+ "step": 1490
1501
+ },
1502
+ {
1503
+ "entropy": 1.5431510925292968,
1504
+ "epoch": 1.6304347826086958,
1505
+ "grad_norm": 1.1734333038330078,
1506
+ "learning_rate": 9.655337368023371e-05,
1507
+ "loss": 1.2108,
1508
+ "mean_token_accuracy": 0.5539384454488754,
1509
+ "num_tokens": 1938610.0,
1510
+ "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 1.5246233105659486,
1514
+ "epoch": 1.641304347826087,
1515
+ "grad_norm": 1.072691559791565,
1516
+ "learning_rate": 9.64875009507239e-05,
1517
+ "loss": 1.1999,
1518
+ "mean_token_accuracy": 0.5761029601097107,
1519
+ "num_tokens": 1951241.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 1.538881742954254,
1524
+ "epoch": 1.6521739130434783,
1525
+ "grad_norm": 1.0783456563949585,
1526
+ "learning_rate": 9.642102758347973e-05,
1527
+ "loss": 1.2443,
1528
+ "mean_token_accuracy": 0.5502734839916229,
1529
+ "num_tokens": 1964816.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 1.550068199634552,
1534
+ "epoch": 1.6630434782608696,
1535
+ "grad_norm": 1.0582056045532227,
1536
+ "learning_rate": 9.63539544373646e-05,
1537
+ "loss": 1.2182,
1538
+ "mean_token_accuracy": 0.5598388969898224,
1539
+ "num_tokens": 1977930.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 1.5344447016716003,
1544
+ "epoch": 1.6739130434782608,
1545
+ "grad_norm": 0.9788505434989929,
1546
+ "learning_rate": 9.628628237899126e-05,
1547
+ "loss": 1.1852,
1548
+ "mean_token_accuracy": 0.5595145970582962,
1549
+ "num_tokens": 1991032.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 1.5468374967575074,
1554
+ "epoch": 1.6847826086956523,
1555
+ "grad_norm": 1.0464048385620117,
1556
+ "learning_rate": 9.621801228271073e-05,
1557
+ "loss": 1.2175,
1558
+ "mean_token_accuracy": 0.5616866886615753,
1559
+ "num_tokens": 2004207.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 1.5422045588493347,
1564
+ "epoch": 1.6956521739130435,
1565
+ "grad_norm": 0.8307158946990967,
1566
+ "learning_rate": 9.614914503060083e-05,
1567
+ "loss": 1.2202,
1568
+ "mean_token_accuracy": 0.5515525698661804,
1569
+ "num_tokens": 2016969.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 1.5343055129051208,
1574
+ "epoch": 1.7065217391304348,
1575
+ "grad_norm": 1.198614239692688,
1576
+ "learning_rate": 9.607968151245498e-05,
1577
+ "loss": 1.1866,
1578
+ "mean_token_accuracy": 0.5771215856075287,
1579
+ "num_tokens": 2029750.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 1.5321205615997315,
1584
+ "epoch": 1.7173913043478262,
1585
+ "grad_norm": 0.9247676134109497,
1586
+ "learning_rate": 9.600962262577053e-05,
1587
+ "loss": 1.2205,
1588
+ "mean_token_accuracy": 0.5626431256532669,
1589
+ "num_tokens": 2043181.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 1.541359269618988,
1594
+ "epoch": 1.7282608695652173,
1595
+ "grad_norm": 1.0934436321258545,
1596
+ "learning_rate": 9.593896927573728e-05,
1597
+ "loss": 1.2397,
1598
+ "mean_token_accuracy": 0.5406488478183746,
1599
+ "num_tokens": 2056541.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 1.5094799280166626,
1604
+ "epoch": 1.7391304347826086,
1605
+ "grad_norm": 0.8803229928016663,
1606
+ "learning_rate": 9.586772237522573e-05,
1607
+ "loss": 1.2047,
1608
+ "mean_token_accuracy": 0.5659328937530518,
1609
+ "num_tokens": 2069752.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 1.4946965217590331,
1614
+ "epoch": 1.75,
1615
+ "grad_norm": 1.0182883739471436,
1616
+ "learning_rate": 9.579588284477526e-05,
1617
+ "loss": 1.1492,
1618
+ "mean_token_accuracy": 0.5829819083213806,
1619
+ "num_tokens": 2083119.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 1.483540380001068,
1624
+ "epoch": 1.7608695652173914,
1625
+ "grad_norm": 1.2263387441635132,
1626
+ "learning_rate": 9.572345161258235e-05,
1627
+ "loss": 1.1474,
1628
+ "mean_token_accuracy": 0.5895151972770691,
1629
+ "num_tokens": 2095862.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 1.5072382926940917,
1634
+ "epoch": 1.7717391304347827,
1635
+ "grad_norm": 0.8639858365058899,
1636
+ "learning_rate": 9.565042961448844e-05,
1637
+ "loss": 1.1997,
1638
+ "mean_token_accuracy": 0.5625985980033874,
1639
+ "num_tokens": 2108549.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 1.5286765098571777,
1644
+ "epoch": 1.7826086956521738,
1645
+ "grad_norm": 1.0652116537094116,
1646
+ "learning_rate": 9.557681779396797e-05,
1647
+ "loss": 1.2253,
1648
+ "mean_token_accuracy": 0.5569576025009155,
1649
+ "num_tokens": 2120871.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 1.5006824493408204,
1654
+ "epoch": 1.7934782608695652,
1655
+ "grad_norm": 0.949942946434021,
1656
+ "learning_rate": 9.550261710211608e-05,
1657
+ "loss": 1.1973,
1658
+ "mean_token_accuracy": 0.5634852379560471,
1659
+ "num_tokens": 2134097.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 1.5050195574760437,
1664
+ "epoch": 1.8043478260869565,
1665
+ "grad_norm": 1.016350507736206,
1666
+ "learning_rate": 9.542782849763637e-05,
1667
+ "loss": 1.1709,
1668
+ "mean_token_accuracy": 0.5780033886432647,
1669
+ "num_tokens": 2147811.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 1.5177413702011109,
1674
+ "epoch": 1.8152173913043477,
1675
+ "grad_norm": 1.264799952507019,
1676
+ "learning_rate": 9.535245294682857e-05,
1677
+ "loss": 1.2513,
1678
+ "mean_token_accuracy": 0.5521585702896118,
1679
+ "num_tokens": 2160506.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 1.5374733328819274,
1684
+ "epoch": 1.8260869565217392,
1685
+ "grad_norm": 1.1713751554489136,
1686
+ "learning_rate": 9.527649142357596e-05,
1687
+ "loss": 1.2708,
1688
+ "mean_token_accuracy": 0.5314113944768906,
1689
+ "num_tokens": 2173328.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 1.5083181142807007,
1694
+ "epoch": 1.8369565217391304,
1695
+ "grad_norm": 1.1553071737289429,
1696
+ "learning_rate": 9.519994490933279e-05,
1697
+ "loss": 1.206,
1698
+ "mean_token_accuracy": 0.5680734992027283,
1699
+ "num_tokens": 2186452.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 1.5291340470314025,
1704
+ "epoch": 1.8478260869565217,
1705
+ "grad_norm": 1.1443442106246948,
1706
+ "learning_rate": 9.51228143931117e-05,
1707
+ "loss": 1.2351,
1708
+ "mean_token_accuracy": 0.5539528131484985,
1709
+ "num_tokens": 2199594.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 1.5204999327659607,
1714
+ "epoch": 1.858695652173913,
1715
+ "grad_norm": 1.1584019660949707,
1716
+ "learning_rate": 9.504510087147088e-05,
1717
+ "loss": 1.2338,
1718
+ "mean_token_accuracy": 0.5519226849079132,
1719
+ "num_tokens": 2212135.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 1.5614403247833253,
1724
+ "epoch": 1.8695652173913042,
1725
+ "grad_norm": 1.0798224210739136,
1726
+ "learning_rate": 9.496680534850113e-05,
1727
+ "loss": 1.2534,
1728
+ "mean_token_accuracy": 0.5530328571796417,
1729
+ "num_tokens": 2225159.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 1.5276212096214294,
1734
+ "epoch": 1.8804347826086958,
1735
+ "grad_norm": 1.1296766996383667,
1736
+ "learning_rate": 9.488792883581299e-05,
1737
+ "loss": 1.1784,
1738
+ "mean_token_accuracy": 0.5774711936712265,
1739
+ "num_tokens": 2238139.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 1.544056522846222,
1744
+ "epoch": 1.891304347826087,
1745
+ "grad_norm": 1.1214172840118408,
1746
+ "learning_rate": 9.480847235252361e-05,
1747
+ "loss": 1.2268,
1748
+ "mean_token_accuracy": 0.5613886952400208,
1749
+ "num_tokens": 2250928.0,
1750
+ "step": 1740
1751
+ },
1752
+ {
1753
+ "entropy": 1.5295695900917052,
1754
+ "epoch": 1.9021739130434783,
1755
+ "grad_norm": 1.1650352478027344,
1756
+ "learning_rate": 9.472843692524363e-05,
1757
+ "loss": 1.1573,
1758
+ "mean_token_accuracy": 0.5787465155124665,
1759
+ "num_tokens": 2263338.0,
1760
+ "step": 1750
1761
+ },
1762
+ {
1763
+ "entropy": 1.5347764611244201,
1764
+ "epoch": 1.9130434782608696,
1765
+ "grad_norm": 1.0249896049499512,
1766
+ "learning_rate": 9.464782358806383e-05,
1767
+ "loss": 1.1731,
1768
+ "mean_token_accuracy": 0.5780636668205261,
1769
+ "num_tokens": 2276200.0,
1770
+ "step": 1760
1771
+ },
1772
+ {
1773
+ "entropy": 1.5715635061264037,
1774
+ "epoch": 1.9239130434782608,
1775
+ "grad_norm": 1.0768051147460938,
1776
+ "learning_rate": 9.45666333825419e-05,
1777
+ "loss": 1.2585,
1778
+ "mean_token_accuracy": 0.5452336609363556,
1779
+ "num_tokens": 2289088.0,
1780
+ "step": 1770
1781
+ },
1782
+ {
1783
+ "entropy": 1.5402274131774902,
1784
+ "epoch": 1.9347826086956523,
1785
+ "grad_norm": 1.0846654176712036,
1786
+ "learning_rate": 9.448486735768884e-05,
1787
+ "loss": 1.1918,
1788
+ "mean_token_accuracy": 0.5699589729309082,
1789
+ "num_tokens": 2302544.0,
1790
+ "step": 1780
1791
+ },
1792
+ {
1793
+ "entropy": 1.5048401594161986,
1794
+ "epoch": 1.9456521739130435,
1795
+ "grad_norm": 1.1533433198928833,
1796
+ "learning_rate": 9.440252656995551e-05,
1797
+ "loss": 1.1792,
1798
+ "mean_token_accuracy": 0.5685461640357972,
1799
+ "num_tokens": 2315473.0,
1800
+ "step": 1790
1801
+ },
1802
+ {
1803
+ "entropy": 1.5128441214561463,
1804
+ "epoch": 1.9565217391304348,
1805
+ "grad_norm": 1.2847894430160522,
1806
+ "learning_rate": 9.431961208321892e-05,
1807
+ "loss": 1.1566,
1808
+ "mean_token_accuracy": 0.5870453357696533,
1809
+ "num_tokens": 2329176.0,
1810
+ "step": 1800
1811
+ },
1812
+ {
1813
+ "entropy": 1.5362990856170655,
1814
+ "epoch": 1.9673913043478262,
1815
+ "grad_norm": 1.2497868537902832,
1816
+ "learning_rate": 9.423612496876855e-05,
1817
+ "loss": 1.1896,
1818
+ "mean_token_accuracy": 0.5719706892967225,
1819
+ "num_tokens": 2341591.0,
1820
+ "step": 1810
1821
+ },
1822
+ {
1823
+ "entropy": 1.5580734014511108,
1824
+ "epoch": 1.9782608695652173,
1825
+ "grad_norm": 1.1140056848526,
1826
+ "learning_rate": 9.415206630529241e-05,
1827
+ "loss": 1.2434,
1828
+ "mean_token_accuracy": 0.5461874425411224,
1829
+ "num_tokens": 2354577.0,
1830
+ "step": 1820
1831
+ },
1832
+ {
1833
+ "entropy": 1.5499179720878602,
1834
+ "epoch": 1.9891304347826086,
1835
+ "grad_norm": 1.0708650350570679,
1836
+ "learning_rate": 9.406743717886321e-05,
1837
+ "loss": 1.1635,
1838
+ "mean_token_accuracy": 0.5835445284843445,
1839
+ "num_tokens": 2366934.0,
1840
+ "step": 1830
1841
+ },
1842
+ {
1843
+ "entropy": 1.5282660722732544,
1844
+ "epoch": 2.0,
1845
+ "grad_norm": 0.9982873797416687,
1846
+ "learning_rate": 9.398223868292424e-05,
1847
+ "loss": 1.162,
1848
+ "mean_token_accuracy": 0.5795026063919068,
1849
+ "num_tokens": 2379988.0,
1850
+ "step": 1840
1851
+ },
1852
+ {
1853
+ "entropy": 1.5200519680976867,
1854
+ "epoch": 2.010869565217391,
1855
+ "grad_norm": 1.7385492324829102,
1856
+ "learning_rate": 9.389647191827533e-05,
1857
+ "loss": 1.1189,
1858
+ "mean_token_accuracy": 0.593557745218277,
1859
+ "num_tokens": 2393159.0,
1860
+ "step": 1850
1861
+ },
1862
+ {
1863
+ "entropy": 1.4853432416915893,
1864
+ "epoch": 2.0217391304347827,
1865
+ "grad_norm": 1.50752592086792,
1866
+ "learning_rate": 9.38101379930585e-05,
1867
+ "loss": 1.1213,
1868
+ "mean_token_accuracy": 0.5929610729217529,
1869
+ "num_tokens": 2405857.0,
1870
+ "step": 1860
1871
+ },
1872
+ {
1873
+ "entropy": 1.4830349206924438,
1874
+ "epoch": 2.032608695652174,
1875
+ "grad_norm": 1.863458275794983,
1876
+ "learning_rate": 9.372323802274379e-05,
1877
+ "loss": 1.054,
1878
+ "mean_token_accuracy": 0.6212123036384583,
1879
+ "num_tokens": 2418899.0,
1880
+ "step": 1870
1881
+ },
1882
+ {
1883
+ "entropy": 1.4546002388000487,
1884
+ "epoch": 2.0434782608695654,
1885
+ "grad_norm": 1.7944133281707764,
1886
+ "learning_rate": 9.363577313011473e-05,
1887
+ "loss": 1.1034,
1888
+ "mean_token_accuracy": 0.6010935366153717,
1889
+ "num_tokens": 2431992.0,
1890
+ "step": 1880
1891
+ },
1892
+ {
1893
+ "entropy": 1.4757711172103882,
1894
+ "epoch": 2.0543478260869565,
1895
+ "grad_norm": 2.174811363220215,
1896
+ "learning_rate": 9.354774444525391e-05,
1897
+ "loss": 1.1084,
1898
+ "mean_token_accuracy": 0.5978758096694946,
1899
+ "num_tokens": 2445070.0,
1900
+ "step": 1890
1901
+ },
1902
+ {
1903
+ "entropy": 1.4131018400192261,
1904
+ "epoch": 2.0652173913043477,
1905
+ "grad_norm": 1.7135498523712158,
1906
+ "learning_rate": 9.345915310552835e-05,
1907
+ "loss": 0.989,
1908
+ "mean_token_accuracy": 0.6438018441200256,
1909
+ "num_tokens": 2458209.0,
1910
+ "step": 1900
1911
+ },
1912
+ {
1913
+ "entropy": 1.433064377307892,
1914
+ "epoch": 2.0760869565217392,
1915
+ "grad_norm": 2.0136771202087402,
1916
+ "learning_rate": 9.337000025557476e-05,
1917
+ "loss": 1.0498,
1918
+ "mean_token_accuracy": 0.6281741559505463,
1919
+ "num_tokens": 2471524.0,
1920
+ "step": 1910
1921
+ },
1922
+ {
1923
+ "entropy": 1.4479591965675354,
1924
+ "epoch": 2.0869565217391304,
1925
+ "grad_norm": 1.974313735961914,
1926
+ "learning_rate": 9.328028704728486e-05,
1927
+ "loss": 1.1358,
1928
+ "mean_token_accuracy": 0.5909003794193268,
1929
+ "num_tokens": 2484390.0,
1930
+ "step": 1920
1931
+ },
1932
+ {
1933
+ "entropy": 1.4301373600959777,
1934
+ "epoch": 2.097826086956522,
1935
+ "grad_norm": 2.091122627258301,
1936
+ "learning_rate": 9.319001463979036e-05,
1937
+ "loss": 1.0644,
1938
+ "mean_token_accuracy": 0.6180503129959106,
1939
+ "num_tokens": 2497381.0,
1940
+ "step": 1930
1941
+ },
1942
+ {
1943
+ "entropy": 1.447307538986206,
1944
+ "epoch": 2.108695652173913,
1945
+ "grad_norm": 2.072110414505005,
1946
+ "learning_rate": 9.309918419944812e-05,
1947
+ "loss": 1.0516,
1948
+ "mean_token_accuracy": 0.6275238871574402,
1949
+ "num_tokens": 2510088.0,
1950
+ "step": 1940
1951
+ },
1952
+ {
1953
+ "entropy": 1.4422586917877198,
1954
+ "epoch": 2.119565217391304,
1955
+ "grad_norm": 2.243779182434082,
1956
+ "learning_rate": 9.300779689982498e-05,
1957
+ "loss": 1.0335,
1958
+ "mean_token_accuracy": 0.6258177101612091,
1959
+ "num_tokens": 2522503.0,
1960
+ "step": 1950
1961
+ },
1962
+ {
1963
+ "entropy": 1.4682541370391846,
1964
+ "epoch": 2.130434782608696,
1965
+ "grad_norm": 1.9335227012634277,
1966
+ "learning_rate": 9.291585392168262e-05,
1967
+ "loss": 1.1021,
1968
+ "mean_token_accuracy": 0.5970285713672638,
1969
+ "num_tokens": 2534900.0,
1970
+ "step": 1960
1971
+ },
1972
+ {
1973
+ "entropy": 1.4592154502868653,
1974
+ "epoch": 2.141304347826087,
1975
+ "grad_norm": 1.8887137174606323,
1976
+ "learning_rate": 9.282335645296236e-05,
1977
+ "loss": 1.1029,
1978
+ "mean_token_accuracy": 0.6081736207008361,
1979
+ "num_tokens": 2547520.0,
1980
+ "step": 1970
1981
+ },
1982
+ {
1983
+ "entropy": 1.4663932919502258,
1984
+ "epoch": 2.1521739130434785,
1985
+ "grad_norm": 1.85846745967865,
1986
+ "learning_rate": 9.273030568876972e-05,
1987
+ "loss": 1.1397,
1988
+ "mean_token_accuracy": 0.5843419551849365,
1989
+ "num_tokens": 2561091.0,
1990
+ "step": 1980
1991
+ },
1992
+ {
1993
+ "entropy": 1.4661445379257203,
1994
+ "epoch": 2.1630434782608696,
1995
+ "grad_norm": 1.8708477020263672,
1996
+ "learning_rate": 9.263670283135908e-05,
1997
+ "loss": 1.0669,
1998
+ "mean_token_accuracy": 0.615378075838089,
1999
+ "num_tokens": 2574368.0,
2000
+ "step": 1990
2001
+ },
2002
+ {
2003
+ "entropy": 1.452696180343628,
2004
+ "epoch": 2.1739130434782608,
2005
+ "grad_norm": 2.0021235942840576,
2006
+ "learning_rate": 9.254254909011804e-05,
2007
+ "loss": 1.1013,
2008
+ "mean_token_accuracy": 0.6071189284324646,
2009
+ "num_tokens": 2587038.0,
2010
+ "step": 2000
2011
+ },
2012
+ {
2013
+ "entropy": 1.476554262638092,
2014
+ "epoch": 2.1847826086956523,
2015
+ "grad_norm": 1.845048427581787,
2016
+ "learning_rate": 9.244784568155186e-05,
2017
+ "loss": 1.1526,
2018
+ "mean_token_accuracy": 0.5891152262687683,
2019
+ "num_tokens": 2599681.0,
2020
+ "step": 2010
2021
+ },
2022
+ {
2023
+ "entropy": 1.456975257396698,
2024
+ "epoch": 2.1956521739130435,
2025
+ "grad_norm": 1.8318718671798706,
2026
+ "learning_rate": 9.235259382926775e-05,
2027
+ "loss": 1.0836,
2028
+ "mean_token_accuracy": 0.6162681877613068,
2029
+ "num_tokens": 2612413.0,
2030
+ "step": 2020
2031
+ },
2032
+ {
2033
+ "entropy": 1.4466612815856934,
2034
+ "epoch": 2.2065217391304346,
2035
+ "grad_norm": 1.732918381690979,
2036
+ "learning_rate": 9.225679476395904e-05,
2037
+ "loss": 1.0735,
2038
+ "mean_token_accuracy": 0.6188689291477203,
2039
+ "num_tokens": 2624887.0,
2040
+ "step": 2030
2041
+ },
2042
+ {
2043
+ "entropy": 1.4582988500595093,
2044
+ "epoch": 2.217391304347826,
2045
+ "grad_norm": 1.8891679048538208,
2046
+ "learning_rate": 9.216044972338924e-05,
2047
+ "loss": 1.1246,
2048
+ "mean_token_accuracy": 0.5922350704669952,
2049
+ "num_tokens": 2637800.0,
2050
+ "step": 2040
2051
+ },
2052
+ {
2053
+ "entropy": 1.423206055164337,
2054
+ "epoch": 2.2282608695652173,
2055
+ "grad_norm": 2.4237935543060303,
2056
+ "learning_rate": 9.206355995237614e-05,
2057
+ "loss": 0.997,
2058
+ "mean_token_accuracy": 0.6368430316448211,
2059
+ "num_tokens": 2650494.0,
2060
+ "step": 2050
2061
+ },
2062
+ {
2063
+ "entropy": 1.4386169552803039,
2064
+ "epoch": 2.239130434782609,
2065
+ "grad_norm": 1.8828299045562744,
2066
+ "learning_rate": 9.196612670277561e-05,
2067
+ "loss": 1.1133,
2068
+ "mean_token_accuracy": 0.6033352434635162,
2069
+ "num_tokens": 2663664.0,
2070
+ "step": 2060
2071
+ },
2072
+ {
2073
+ "entropy": 1.47161967754364,
2074
+ "epoch": 2.25,
2075
+ "grad_norm": 2.1654105186462402,
2076
+ "learning_rate": 9.186815123346555e-05,
2077
+ "loss": 1.1308,
2078
+ "mean_token_accuracy": 0.5900616765022277,
2079
+ "num_tokens": 2676431.0,
2080
+ "step": 2070
2081
+ },
2082
+ {
2083
+ "entropy": 1.4096957564353942,
2084
+ "epoch": 2.260869565217391,
2085
+ "grad_norm": 2.163583278656006,
2086
+ "learning_rate": 9.176963481032951e-05,
2087
+ "loss": 1.0434,
2088
+ "mean_token_accuracy": 0.624973613023758,
2089
+ "num_tokens": 2688969.0,
2090
+ "step": 2080
2091
+ },
2092
+ {
2093
+ "entropy": 1.4344560146331786,
2094
+ "epoch": 2.2717391304347827,
2095
+ "grad_norm": 2.001441478729248,
2096
+ "learning_rate": 9.167057870624045e-05,
2097
+ "loss": 1.0799,
2098
+ "mean_token_accuracy": 0.6116856634616852,
2099
+ "num_tokens": 2701940.0,
2100
+ "step": 2090
2101
+ },
2102
+ {
2103
+ "entropy": 1.4405582904815675,
2104
+ "epoch": 2.282608695652174,
2105
+ "grad_norm": 2.0407516956329346,
2106
+ "learning_rate": 9.157098420104416e-05,
2107
+ "loss": 1.1088,
2108
+ "mean_token_accuracy": 0.5969192087650299,
2109
+ "num_tokens": 2714710.0,
2110
+ "step": 2100
2111
+ },
2112
+ {
2113
+ "entropy": 1.4413450241088868,
2114
+ "epoch": 2.2934782608695654,
2115
+ "grad_norm": 2.175715684890747,
2116
+ "learning_rate": 9.147085258154284e-05,
2117
+ "loss": 1.1133,
2118
+ "mean_token_accuracy": 0.597865492105484,
2119
+ "num_tokens": 2728122.0,
2120
+ "step": 2110
2121
+ },
2122
+ {
2123
+ "entropy": 1.4492259979248048,
2124
+ "epoch": 2.3043478260869565,
2125
+ "grad_norm": 1.756759762763977,
2126
+ "learning_rate": 9.137018514147842e-05,
2127
+ "loss": 1.139,
2128
+ "mean_token_accuracy": 0.5876732349395752,
2129
+ "num_tokens": 2741386.0,
2130
+ "step": 2120
2131
+ },
2132
+ {
2133
+ "entropy": 1.4231749057769776,
2134
+ "epoch": 2.3152173913043477,
2135
+ "grad_norm": 2.2174477577209473,
2136
+ "learning_rate": 9.126898318151585e-05,
2137
+ "loss": 1.0647,
2138
+ "mean_token_accuracy": 0.6096932172775269,
2139
+ "num_tokens": 2754105.0,
2140
+ "step": 2130
2141
+ },
2142
+ {
2143
+ "entropy": 1.4355740666389465,
2144
+ "epoch": 2.3260869565217392,
2145
+ "grad_norm": 1.9028793573379517,
2146
+ "learning_rate": 9.116724800922629e-05,
2147
+ "loss": 1.1054,
2148
+ "mean_token_accuracy": 0.6013290226459503,
2149
+ "num_tokens": 2767282.0,
2150
+ "step": 2140
2151
+ },
2152
+ {
2153
+ "entropy": 1.4280303597450257,
2154
+ "epoch": 2.3369565217391304,
2155
+ "grad_norm": 2.2789151668548584,
2156
+ "learning_rate": 9.106498093907024e-05,
2157
+ "loss": 1.0761,
2158
+ "mean_token_accuracy": 0.6177151262760162,
2159
+ "num_tokens": 2779590.0,
2160
+ "step": 2150
2161
+ },
2162
+ {
2163
+ "entropy": 1.4350167870521546,
2164
+ "epoch": 2.3478260869565215,
2165
+ "grad_norm": 1.9398648738861084,
2166
+ "learning_rate": 9.096218329238053e-05,
2167
+ "loss": 1.1292,
2168
+ "mean_token_accuracy": 0.5900428295135498,
2169
+ "num_tokens": 2792004.0,
2170
+ "step": 2160
2171
+ },
2172
+ {
2173
+ "entropy": 1.4391902089118958,
2174
+ "epoch": 2.358695652173913,
2175
+ "grad_norm": 2.298678159713745,
2176
+ "learning_rate": 9.085885639734527e-05,
2177
+ "loss": 1.0603,
2178
+ "mean_token_accuracy": 0.6203874349594116,
2179
+ "num_tokens": 2804852.0,
2180
+ "step": 2170
2181
+ },
2182
+ {
2183
+ "entropy": 1.4266687989234925,
2184
+ "epoch": 2.369565217391304,
2185
+ "grad_norm": 1.8723516464233398,
2186
+ "learning_rate": 9.075500158899067e-05,
2187
+ "loss": 1.0439,
2188
+ "mean_token_accuracy": 0.6204161286354065,
2189
+ "num_tokens": 2818240.0,
2190
+ "step": 2180
2191
+ },
2192
+ {
2193
+ "entropy": 1.447180449962616,
2194
+ "epoch": 2.380434782608696,
2195
+ "grad_norm": 2.6517324447631836,
2196
+ "learning_rate": 9.065062020916377e-05,
2197
+ "loss": 1.0897,
2198
+ "mean_token_accuracy": 0.6031298160552978,
2199
+ "num_tokens": 2831531.0,
2200
+ "step": 2190
2201
+ },
2202
+ {
2203
+ "entropy": 1.45684095621109,
2204
+ "epoch": 2.391304347826087,
2205
+ "grad_norm": 2.0795247554779053,
2206
+ "learning_rate": 9.054571360651517e-05,
2207
+ "loss": 1.0772,
2208
+ "mean_token_accuracy": 0.6159474074840545,
2209
+ "num_tokens": 2844349.0,
2210
+ "step": 2200
2211
+ },
2212
+ {
2213
+ "entropy": 1.4636580228805542,
2214
+ "epoch": 2.4021739130434785,
2215
+ "grad_norm": 2.3823142051696777,
2216
+ "learning_rate": 9.044028313648157e-05,
2217
+ "loss": 1.0985,
2218
+ "mean_token_accuracy": 0.600275206565857,
2219
+ "num_tokens": 2857149.0,
2220
+ "step": 2210
2221
+ },
2222
+ {
2223
+ "entropy": 1.452285599708557,
2224
+ "epoch": 2.4130434782608696,
2225
+ "grad_norm": 2.1175410747528076,
2226
+ "learning_rate": 9.033433016126822e-05,
2227
+ "loss": 1.1088,
2228
+ "mean_token_accuracy": 0.6099293529987335,
2229
+ "num_tokens": 2869994.0,
2230
+ "step": 2220
2231
+ },
2232
+ {
2233
+ "entropy": 1.4636402249336242,
2234
+ "epoch": 2.4239130434782608,
2235
+ "grad_norm": 2.156698226928711,
2236
+ "learning_rate": 9.022785604983139e-05,
2237
+ "loss": 1.0813,
2238
+ "mean_token_accuracy": 0.6157242119312286,
2239
+ "num_tokens": 2882741.0,
2240
+ "step": 2230
2241
+ },
2242
+ {
2243
+ "entropy": 1.448258113861084,
2244
+ "epoch": 2.4347826086956523,
2245
+ "grad_norm": 2.1612956523895264,
2246
+ "learning_rate": 9.01208621778606e-05,
2247
+ "loss": 1.1146,
2248
+ "mean_token_accuracy": 0.5982438385486603,
2249
+ "num_tokens": 2895598.0,
2250
+ "step": 2240
2251
+ },
2252
+ {
2253
+ "entropy": 1.4485355496406556,
2254
+ "epoch": 2.4456521739130435,
2255
+ "grad_norm": 2.342386245727539,
2256
+ "learning_rate": 9.001334992776094e-05,
2257
+ "loss": 1.1075,
2258
+ "mean_token_accuracy": 0.59824697971344,
2259
+ "num_tokens": 2908649.0,
2260
+ "step": 2250
2261
+ },
2262
+ {
2263
+ "entropy": 1.4362555623054505,
2264
+ "epoch": 2.4565217391304346,
2265
+ "grad_norm": 2.226168394088745,
2266
+ "learning_rate": 8.990532068863513e-05,
2267
+ "loss": 1.0506,
2268
+ "mean_token_accuracy": 0.6281331360340119,
2269
+ "num_tokens": 2921983.0,
2270
+ "step": 2260
2271
+ },
2272
+ {
2273
+ "entropy": 1.453425133228302,
2274
+ "epoch": 2.467391304347826,
2275
+ "grad_norm": 2.0690982341766357,
2276
+ "learning_rate": 8.979677585626559e-05,
2277
+ "loss": 1.1294,
2278
+ "mean_token_accuracy": 0.5930320501327515,
2279
+ "num_tokens": 2934939.0,
2280
+ "step": 2270
2281
+ },
2282
+ {
2283
+ "entropy": 1.4780887007713317,
2284
+ "epoch": 2.4782608695652173,
2285
+ "grad_norm": 1.5956579446792603,
2286
+ "learning_rate": 8.968771683309645e-05,
2287
+ "loss": 1.1802,
2288
+ "mean_token_accuracy": 0.5728381037712097,
2289
+ "num_tokens": 2947635.0,
2290
+ "step": 2280
2291
+ },
2292
+ {
2293
+ "entropy": 1.4748708367347718,
2294
+ "epoch": 2.489130434782609,
2295
+ "grad_norm": 2.1426117420196533,
2296
+ "learning_rate": 8.95781450282154e-05,
2297
+ "loss": 1.1558,
2298
+ "mean_token_accuracy": 0.591809231042862,
2299
+ "num_tokens": 2960233.0,
2300
+ "step": 2290
2301
+ },
2302
+ {
2303
+ "entropy": 1.4672940373420715,
2304
+ "epoch": 2.5,
2305
+ "grad_norm": 2.2189557552337646,
2306
+ "learning_rate": 8.946806185733543e-05,
2307
+ "loss": 1.0865,
2308
+ "mean_token_accuracy": 0.6044426262378693,
2309
+ "num_tokens": 2973559.0,
2310
+ "step": 2300
2311
+ },
2312
+ {
2313
+ "entropy": 1.4749647736549378,
2314
+ "epoch": 2.5108695652173916,
2315
+ "grad_norm": 2.0216891765594482,
2316
+ "learning_rate": 8.935746874277667e-05,
2317
+ "loss": 1.1216,
2318
+ "mean_token_accuracy": 0.5929350137710572,
2319
+ "num_tokens": 2986671.0,
2320
+ "step": 2310
2321
+ },
2322
+ {
2323
+ "entropy": 1.468804383277893,
2324
+ "epoch": 2.5217391304347827,
2325
+ "grad_norm": 1.746561050415039,
2326
+ "learning_rate": 8.924636711344784e-05,
2327
+ "loss": 1.1529,
2328
+ "mean_token_accuracy": 0.5794779539108277,
2329
+ "num_tokens": 2999818.0,
2330
+ "step": 2320
2331
+ },
2332
+ {
2333
+ "entropy": 1.4579639315605164,
2334
+ "epoch": 2.532608695652174,
2335
+ "grad_norm": 2.7989909648895264,
2336
+ "learning_rate": 8.913475840482797e-05,
2337
+ "loss": 1.1037,
2338
+ "mean_token_accuracy": 0.5982204794883728,
2339
+ "num_tokens": 3012539.0,
2340
+ "step": 2330
2341
+ },
2342
+ {
2343
+ "entropy": 1.4399624347686768,
2344
+ "epoch": 2.5434782608695654,
2345
+ "grad_norm": 2.2155885696411133,
2346
+ "learning_rate": 8.902264405894771e-05,
2347
+ "loss": 1.0702,
2348
+ "mean_token_accuracy": 0.6180161297321319,
2349
+ "num_tokens": 3025100.0,
2350
+ "step": 2340
2351
+ },
2352
+ {
2353
+ "entropy": 1.4719431400299072,
2354
+ "epoch": 2.5543478260869565,
2355
+ "grad_norm": 2.0845561027526855,
2356
+ "learning_rate": 8.891002552437076e-05,
2357
+ "loss": 1.1735,
2358
+ "mean_token_accuracy": 0.5835448384284974,
2359
+ "num_tokens": 3037684.0,
2360
+ "step": 2350
2361
+ },
2362
+ {
2363
+ "entropy": 1.4472892165184021,
2364
+ "epoch": 2.5652173913043477,
2365
+ "grad_norm": 1.958483099937439,
2366
+ "learning_rate": 8.879690425617517e-05,
2367
+ "loss": 1.0761,
2368
+ "mean_token_accuracy": 0.607106763124466,
2369
+ "num_tokens": 3050626.0,
2370
+ "step": 2360
2371
+ },
2372
+ {
2373
+ "entropy": 1.4262180566787719,
2374
+ "epoch": 2.5760869565217392,
2375
+ "grad_norm": 2.5125794410705566,
2376
+ "learning_rate": 8.868328171593448e-05,
2377
+ "loss": 1.0234,
2378
+ "mean_token_accuracy": 0.6303494095802307,
2379
+ "num_tokens": 3063145.0,
2380
+ "step": 2370
2381
+ },
2382
+ {
2383
+ "entropy": 1.4762691497802733,
2384
+ "epoch": 2.5869565217391304,
2385
+ "grad_norm": 1.9473073482513428,
2386
+ "learning_rate": 8.85691593716989e-05,
2387
+ "loss": 1.1387,
2388
+ "mean_token_accuracy": 0.5892048954963685,
2389
+ "num_tokens": 3076300.0,
2390
+ "step": 2380
2391
+ },
2392
+ {
2393
+ "entropy": 1.4989805698394776,
2394
+ "epoch": 2.5978260869565215,
2395
+ "grad_norm": 2.1876816749572754,
2396
+ "learning_rate": 8.845453869797631e-05,
2397
+ "loss": 1.1517,
2398
+ "mean_token_accuracy": 0.5851214408874512,
2399
+ "num_tokens": 3089439.0,
2400
+ "step": 2390
2401
+ },
2402
+ {
2403
+ "entropy": 1.4469679355621339,
2404
+ "epoch": 2.608695652173913,
2405
+ "grad_norm": 2.1269173622131348,
2406
+ "learning_rate": 8.833942117571321e-05,
2407
+ "loss": 1.0865,
2408
+ "mean_token_accuracy": 0.604921555519104,
2409
+ "num_tokens": 3102252.0,
2410
+ "step": 2400
2411
+ },
2412
+ {
2413
+ "entropy": 1.4462681651115417,
2414
+ "epoch": 2.619565217391304,
2415
+ "grad_norm": 2.108311176300049,
2416
+ "learning_rate": 8.822380829227559e-05,
2417
+ "loss": 1.0939,
2418
+ "mean_token_accuracy": 0.6013546764850617,
2419
+ "num_tokens": 3115257.0,
2420
+ "step": 2410
2421
+ },
2422
+ {
2423
+ "entropy": 1.452495265007019,
2424
+ "epoch": 2.630434782608696,
2425
+ "grad_norm": 1.948119878768921,
2426
+ "learning_rate": 8.810770154142969e-05,
2427
+ "loss": 1.1095,
2428
+ "mean_token_accuracy": 0.6048475563526153,
2429
+ "num_tokens": 3128271.0,
2430
+ "step": 2420
2431
+ },
2432
+ {
2433
+ "entropy": 1.4279615521430968,
2434
+ "epoch": 2.641304347826087,
2435
+ "grad_norm": 2.3602614402770996,
2436
+ "learning_rate": 8.799110242332276e-05,
2437
+ "loss": 1.0708,
2438
+ "mean_token_accuracy": 0.6152625262737275,
2439
+ "num_tokens": 3140987.0,
2440
+ "step": 2430
2441
+ },
2442
+ {
2443
+ "entropy": 1.445945417881012,
2444
+ "epoch": 2.6521739130434785,
2445
+ "grad_norm": 1.9659548997879028,
2446
+ "learning_rate": 8.787401244446361e-05,
2447
+ "loss": 1.1057,
2448
+ "mean_token_accuracy": 0.6044125318527221,
2449
+ "num_tokens": 3154073.0,
2450
+ "step": 2440
2451
+ },
2452
+ {
2453
+ "entropy": 1.4435201168060303,
2454
+ "epoch": 2.6630434782608696,
2455
+ "grad_norm": 2.0865726470947266,
2456
+ "learning_rate": 8.775643311770318e-05,
2457
+ "loss": 1.0508,
2458
+ "mean_token_accuracy": 0.6200963973999023,
2459
+ "num_tokens": 3166774.0,
2460
+ "step": 2450
2461
+ },
2462
+ {
2463
+ "entropy": 1.4427612900733948,
2464
+ "epoch": 2.6739130434782608,
2465
+ "grad_norm": 1.95523202419281,
2466
+ "learning_rate": 8.7638365962215e-05,
2467
+ "loss": 1.0604,
2468
+ "mean_token_accuracy": 0.6158404231071473,
2469
+ "num_tokens": 3180239.0,
2470
+ "step": 2460
2471
+ },
2472
+ {
2473
+ "entropy": 1.480810034275055,
2474
+ "epoch": 2.6847826086956523,
2475
+ "grad_norm": 1.7022428512573242,
2476
+ "learning_rate": 8.751981250347552e-05,
2477
+ "loss": 1.1435,
2478
+ "mean_token_accuracy": 0.5830896198749542,
2479
+ "num_tokens": 3193085.0,
2480
+ "step": 2470
2481
+ },
2482
+ {
2483
+ "entropy": 1.4741726160049438,
2484
+ "epoch": 2.6956521739130435,
2485
+ "grad_norm": 1.9417518377304077,
2486
+ "learning_rate": 8.740077427324446e-05,
2487
+ "loss": 1.1434,
2488
+ "mean_token_accuracy": 0.5773356080055236,
2489
+ "num_tokens": 3206199.0,
2490
+ "step": 2480
2491
+ },
2492
+ {
2493
+ "entropy": 1.4773874044418336,
2494
+ "epoch": 2.7065217391304346,
2495
+ "grad_norm": 1.9512629508972168,
2496
+ "learning_rate": 8.728125280954498e-05,
2497
+ "loss": 1.1219,
2498
+ "mean_token_accuracy": 0.5898585200309754,
2499
+ "num_tokens": 3219296.0,
2500
+ "step": 2490
2501
+ },
2502
+ {
2503
+ "entropy": 1.4541948914527894,
2504
+ "epoch": 2.717391304347826,
2505
+ "grad_norm": 2.068798542022705,
2506
+ "learning_rate": 8.716124965664379e-05,
2507
+ "loss": 1.1123,
2508
+ "mean_token_accuracy": 0.6047747969627381,
2509
+ "num_tokens": 3232042.0,
2510
+ "step": 2500
2511
+ },
2512
+ {
2513
+ "entropy": 1.4830847024917602,
2514
+ "epoch": 2.7282608695652173,
2515
+ "grad_norm": 1.9898854494094849,
2516
+ "learning_rate": 8.704076636503128e-05,
2517
+ "loss": 1.1671,
2518
+ "mean_token_accuracy": 0.5747832953929901,
2519
+ "num_tokens": 3244815.0,
2520
+ "step": 2510
2521
+ },
2522
+ {
2523
+ "entropy": 1.4612651348114014,
2524
+ "epoch": 2.7391304347826084,
2525
+ "grad_norm": 2.2067410945892334,
2526
+ "learning_rate": 8.691980449140135e-05,
2527
+ "loss": 1.0978,
2528
+ "mean_token_accuracy": 0.5956085741519928,
2529
+ "num_tokens": 3257668.0,
2530
+ "step": 2520
2531
+ },
2532
+ {
2533
+ "entropy": 1.462200677394867,
2534
+ "epoch": 2.75,
2535
+ "grad_norm": 2.014509439468384,
2536
+ "learning_rate": 8.679836559863148e-05,
2537
+ "loss": 1.1169,
2538
+ "mean_token_accuracy": 0.5984118282794952,
2539
+ "num_tokens": 3270813.0,
2540
+ "step": 2530
2541
+ },
2542
+ {
2543
+ "entropy": 1.4501710295677186,
2544
+ "epoch": 2.7608695652173916,
2545
+ "grad_norm": 2.5156266689300537,
2546
+ "learning_rate": 8.667645125576235e-05,
2547
+ "loss": 1.1319,
2548
+ "mean_token_accuracy": 0.5989238739013671,
2549
+ "num_tokens": 3283624.0,
2550
+ "step": 2540
2551
+ },
2552
+ {
2553
+ "entropy": 1.4544371724128724,
2554
+ "epoch": 2.7717391304347827,
2555
+ "grad_norm": 1.9277708530426025,
2556
+ "learning_rate": 8.655406303797767e-05,
2557
+ "loss": 1.0811,
2558
+ "mean_token_accuracy": 0.6054942727088928,
2559
+ "num_tokens": 3296764.0,
2560
+ "step": 2550
2561
+ },
2562
+ {
2563
+ "entropy": 1.462872362136841,
2564
+ "epoch": 2.782608695652174,
2565
+ "grad_norm": 2.244176149368286,
2566
+ "learning_rate": 8.643120252658381e-05,
2567
+ "loss": 1.1231,
2568
+ "mean_token_accuracy": 0.5996806621551514,
2569
+ "num_tokens": 3310476.0,
2570
+ "step": 2560
2571
+ },
2572
+ {
2573
+ "entropy": 1.4650163173675537,
2574
+ "epoch": 2.7934782608695654,
2575
+ "grad_norm": 2.060368537902832,
2576
+ "learning_rate": 8.630787130898943e-05,
2577
+ "loss": 1.1183,
2578
+ "mean_token_accuracy": 0.5905598402023315,
2579
+ "num_tokens": 3323159.0,
2580
+ "step": 2570
2581
+ },
2582
+ {
2583
+ "entropy": 1.4386309385299683,
2584
+ "epoch": 2.8043478260869565,
2585
+ "grad_norm": 1.8049969673156738,
2586
+ "learning_rate": 8.618407097868482e-05,
2587
+ "loss": 1.0914,
2588
+ "mean_token_accuracy": 0.6009307503700256,
2589
+ "num_tokens": 3336124.0,
2590
+ "step": 2580
2591
+ },
2592
+ {
2593
+ "entropy": 1.4660000324249267,
2594
+ "epoch": 2.8152173913043477,
2595
+ "grad_norm": 1.997644066810608,
2596
+ "learning_rate": 8.605980313522142e-05,
2597
+ "loss": 1.1499,
2598
+ "mean_token_accuracy": 0.5847253501415253,
2599
+ "num_tokens": 3349224.0,
2600
+ "step": 2590
2601
+ },
2602
+ {
2603
+ "entropy": 1.432177233695984,
2604
+ "epoch": 2.8260869565217392,
2605
+ "grad_norm": 2.895296096801758,
2606
+ "learning_rate": 8.59350693841912e-05,
2607
+ "loss": 1.0535,
2608
+ "mean_token_accuracy": 0.622643232345581,
2609
+ "num_tokens": 3362020.0,
2610
+ "step": 2600
2611
+ },
2612
+ {
2613
+ "entropy": 1.4307915687561035,
2614
+ "epoch": 2.8369565217391304,
2615
+ "grad_norm": 2.170982599258423,
2616
+ "learning_rate": 8.580987133720576e-05,
2617
+ "loss": 1.0787,
2618
+ "mean_token_accuracy": 0.6147644519805908,
2619
+ "num_tokens": 3374950.0,
2620
+ "step": 2610
2621
+ },
2622
+ {
2623
+ "entropy": 1.4522701263427735,
2624
+ "epoch": 2.8478260869565215,
2625
+ "grad_norm": 2.1109468936920166,
2626
+ "learning_rate": 8.568421061187567e-05,
2627
+ "loss": 1.0683,
2628
+ "mean_token_accuracy": 0.6159465253353119,
2629
+ "num_tokens": 3387887.0,
2630
+ "step": 2620
2631
+ },
2632
+ {
2633
+ "entropy": 1.435029399394989,
2634
+ "epoch": 2.858695652173913,
2635
+ "grad_norm": 2.271132469177246,
2636
+ "learning_rate": 8.55580888317894e-05,
2637
+ "loss": 1.0815,
2638
+ "mean_token_accuracy": 0.6143145263195038,
2639
+ "num_tokens": 3401084.0,
2640
+ "step": 2630
2641
+ },
2642
+ {
2643
+ "entropy": 1.4437922477722167,
2644
+ "epoch": 2.869565217391304,
2645
+ "grad_norm": 2.1083803176879883,
2646
+ "learning_rate": 8.543150762649257e-05,
2647
+ "loss": 1.1018,
2648
+ "mean_token_accuracy": 0.6052383124828339,
2649
+ "num_tokens": 3414604.0,
2650
+ "step": 2640
2651
+ },
2652
+ {
2653
+ "entropy": 1.4524288296699523,
2654
+ "epoch": 2.880434782608696,
2655
+ "grad_norm": 2.2077994346618652,
2656
+ "learning_rate": 8.530446863146664e-05,
2657
+ "loss": 1.1257,
2658
+ "mean_token_accuracy": 0.5953928649425506,
2659
+ "num_tokens": 3427383.0,
2660
+ "step": 2650
2661
+ },
2662
+ {
2663
+ "entropy": 1.4274506211280822,
2664
+ "epoch": 2.891304347826087,
2665
+ "grad_norm": 2.1553750038146973,
2666
+ "learning_rate": 8.517697348810798e-05,
2667
+ "loss": 1.1036,
2668
+ "mean_token_accuracy": 0.6045750975608826,
2669
+ "num_tokens": 3440363.0,
2670
+ "step": 2660
2671
+ },
2672
+ {
2673
+ "entropy": 1.4272564888000487,
2674
+ "epoch": 2.9021739130434785,
2675
+ "grad_norm": 1.6765693426132202,
2676
+ "learning_rate": 8.504902384370657e-05,
2677
+ "loss": 1.0936,
2678
+ "mean_token_accuracy": 0.6030111908912659,
2679
+ "num_tokens": 3453079.0,
2680
+ "step": 2670
2681
+ },
2682
+ {
2683
+ "entropy": 1.4609074950218202,
2684
+ "epoch": 2.9130434782608696,
2685
+ "grad_norm": 2.0705795288085938,
2686
+ "learning_rate": 8.492062135142469e-05,
2687
+ "loss": 1.1471,
2688
+ "mean_token_accuracy": 0.5836342215538025,
2689
+ "num_tokens": 3466067.0,
2690
+ "step": 2680
2691
+ },
2692
+ {
2693
+ "entropy": 1.4428972482681275,
2694
+ "epoch": 2.9239130434782608,
2695
+ "grad_norm": 1.660536289215088,
2696
+ "learning_rate": 8.479176767027566e-05,
2697
+ "loss": 1.1191,
2698
+ "mean_token_accuracy": 0.5946624577045441,
2699
+ "num_tokens": 3479321.0,
2700
+ "step": 2690
2701
+ },
2702
+ {
2703
+ "entropy": 1.4401050209999084,
2704
+ "epoch": 2.9347826086956523,
2705
+ "grad_norm": 2.135024070739746,
2706
+ "learning_rate": 8.466246446510231e-05,
2707
+ "loss": 1.1123,
2708
+ "mean_token_accuracy": 0.5984612464904785,
2709
+ "num_tokens": 3492053.0,
2710
+ "step": 2700
2711
+ },
2712
+ {
2713
+ "entropy": 1.4631260633468628,
2714
+ "epoch": 2.9456521739130435,
2715
+ "grad_norm": 1.980193853378296,
2716
+ "learning_rate": 8.45327134065555e-05,
2717
+ "loss": 1.1476,
2718
+ "mean_token_accuracy": 0.5893894731998444,
2719
+ "num_tokens": 3505044.0,
2720
+ "step": 2710
2721
+ },
2722
+ {
2723
+ "entropy": 1.4601631045341492,
2724
+ "epoch": 2.9565217391304346,
2725
+ "grad_norm": 1.9377002716064453,
2726
+ "learning_rate": 8.44025161710726e-05,
2727
+ "loss": 1.0807,
2728
+ "mean_token_accuracy": 0.6130505263805389,
2729
+ "num_tokens": 3518122.0,
2730
+ "step": 2720
2731
+ },
2732
+ {
2733
+ "entropy": 1.4349589943885803,
2734
+ "epoch": 2.967391304347826,
2735
+ "grad_norm": 2.004295587539673,
2736
+ "learning_rate": 8.42718744408557e-05,
2737
+ "loss": 1.0994,
2738
+ "mean_token_accuracy": 0.6051357507705688,
2739
+ "num_tokens": 3530855.0,
2740
+ "step": 2730
2741
+ },
2742
+ {
2743
+ "entropy": 1.4589566707611084,
2744
+ "epoch": 2.9782608695652173,
2745
+ "grad_norm": 2.228742837905884,
2746
+ "learning_rate": 8.414078990384995e-05,
2747
+ "loss": 1.1294,
2748
+ "mean_token_accuracy": 0.5972096085548401,
2749
+ "num_tokens": 3543926.0,
2750
+ "step": 2740
2751
+ },
2752
+ {
2753
+ "entropy": 1.4834345698356628,
2754
+ "epoch": 2.9891304347826084,
2755
+ "grad_norm": 1.964888095855713,
2756
+ "learning_rate": 8.400926425372182e-05,
2757
+ "loss": 1.182,
2758
+ "mean_token_accuracy": 0.582332044839859,
2759
+ "num_tokens": 3556590.0,
2760
+ "step": 2750
2761
+ },
2762
+ {
2763
+ "entropy": 1.4434187054634093,
2764
+ "epoch": 3.0,
2765
+ "grad_norm": 2.4529945850372314,
2766
+ "learning_rate": 8.387729918983706e-05,
2767
+ "loss": 1.0934,
2768
+ "mean_token_accuracy": 0.6053484171628952,
2769
+ "num_tokens": 3569982.0,
2770
+ "step": 2760
2771
+ }
2772
+ ],
2773
+ "logging_steps": 10,
2774
+ "max_steps": 9200,
2775
+ "num_input_tokens_seen": 0,
2776
+ "num_train_epochs": 10,
2777
+ "save_steps": 500,
2778
+ "stateful_callbacks": {
2779
+ "TrainerControl": {
2780
+ "args": {
2781
+ "should_epoch_stop": false,
2782
+ "should_evaluate": false,
2783
+ "should_log": false,
2784
+ "should_save": true,
2785
+ "should_training_stop": false
2786
+ },
2787
+ "attributes": {}
2788
+ }
2789
+ },
2790
+ "total_flos": 1.518842606712238e+17,
2791
+ "train_batch_size": 8,
2792
+ "trial_name": null,
2793
+ "trial_params": null
2794
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d307793ac8defecd3c83909e3edd67ba0adff5dab9d19e8ababe22ba1e871ad
3
+ size 6481
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-2760/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "v_proj",
37
+ "q_proj",
38
+ "o_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39fc106588aab87c0c75f9a3096b4f5a4f8b5d70b6ee9265174e6306e3ca2d67
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d307793ac8defecd3c83909e3edd67ba0adff5dab9d19e8ababe22ba1e871ad
3
+ size 6481
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-3680/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "v_proj",
37
+ "q_proj",
38
+ "o_proj",
39
+ "k_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b4662618e7ac602191b7ecb54df44c2644a9b346fb4c281593eaa7b19b6eaa8
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a6_L22_broadcast_noSys/checkpoint-4600/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896