agu18dec committed on
Commit
1d6ec55
·
verified ·
1 Parent(s): e77c492

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/README.md +61 -0
  3. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_config.json +48 -0
  4. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_model.safetensors +3 -0
  5. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/added_tokens.json +24 -0
  6. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/chat_template.jinja +54 -0
  7. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/README.md +209 -0
  8. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_config.json +48 -0
  9. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_model.safetensors +3 -0
  10. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/added_tokens.json +24 -0
  11. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/chat_template.jinja +54 -0
  12. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/merges.txt +0 -0
  13. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/special_tokens_map.json +31 -0
  14. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer.json +3 -0
  15. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer_config.json +207 -0
  16. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/trainer_state.json +1774 -0
  17. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/training_args.bin +3 -0
  18. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/vocab.json +0 -0
  19. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/README.md +209 -0
  20. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_config.json +48 -0
  21. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_model.safetensors +3 -0
  22. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/added_tokens.json +24 -0
  23. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/chat_template.jinja +54 -0
  24. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/merges.txt +0 -0
  25. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/special_tokens_map.json +31 -0
  26. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer.json +3 -0
  27. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer_config.json +207 -0
  28. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/trainer_state.json +2644 -0
  29. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/training_args.bin +3 -0
  30. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/vocab.json +0 -0
  31. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/README.md +209 -0
  32. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_config.json +48 -0
  33. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_model.safetensors +3 -0
  34. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/added_tokens.json +24 -0
  35. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/chat_template.jinja +54 -0
  36. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/merges.txt +0 -0
  37. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/special_tokens_map.json +31 -0
  38. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer.json +3 -0
  39. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer_config.json +207 -0
  40. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/trainer_state.json +0 -0
  41. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/training_args.bin +3 -0
  42. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/vocab.json +0 -0
  43. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/README.md +209 -0
  44. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_config.json +48 -0
  45. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_model.safetensors +3 -0
  46. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/added_tokens.json +24 -0
  47. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/chat_template.jinja +54 -0
  48. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/merges.txt +0 -0
  49. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/special_tokens_map.json +31 -0
  50. checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/tokenizer.json +3 -0
.gitattributes CHANGED
@@ -600,3 +600,14 @@ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noS
600
  checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-8144/tokenizer.json filter=lfs diff=lfs merge=lfs -text
601
  checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-9162/tokenizer.json filter=lfs diff=lfs merge=lfs -text
602
  checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
600
  checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-8144/tokenizer.json filter=lfs diff=lfs merge=lfs -text
601
  checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/checkpoint-9162/tokenizer.json filter=lfs diff=lfs merge=lfs -text
602
  checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.5_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
603
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
604
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer.json filter=lfs diff=lfs merge=lfs -text
605
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer.json filter=lfs diff=lfs merge=lfs -text
606
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/tokenizer.json filter=lfs diff=lfs merge=lfs -text
607
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-5232/tokenizer.json filter=lfs diff=lfs merge=lfs -text
608
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-6104/tokenizer.json filter=lfs diff=lfs merge=lfs -text
609
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-6976/tokenizer.json filter=lfs diff=lfs merge=lfs -text
610
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-7848/tokenizer.json filter=lfs diff=lfs merge=lfs -text
611
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-872/tokenizer.json filter=lfs diff=lfs merge=lfs -text
612
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-8720/tokenizer.json filter=lfs diff=lfs merge=lfs -text
613
+ checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ licence: license
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys
16
+
17
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys", device="cuda")
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/7w4qne7u)
34
+
35
+
36
+ This model was trained with SFT.
37
+
38
+ ### Framework versions
39
+
40
+ - PEFT: 0.19.1
41
+ - TRL: 0.28.0
42
+ - Transformers: 4.57.6
43
+ - Pytorch: 2.9.1
44
+ - Datasets: 4.5.0
45
+ - Tokenizers: 0.22.2
46
+
47
+ ## Citations
48
+
49
+
50
+
51
+ Cite TRL as:
52
+
53
+ ```bibtex
54
+ @software{vonwerra2020trl,
55
+ title = {{TRL: Transformers Reinforcement Learning}},
56
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
57
+ license = {Apache-2.0},
58
+ url = {https://github.com/huggingface/trl},
59
+ year = {2020}
60
+ }
61
+ ```
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "gate_proj",
39
+ "q_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4497725543943b3ffc62917072d81017d4a0be55b8f2c8ef0ebfd55b9aeb2831
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "gate_proj",
39
+ "q_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4825512ba858d36680597e1a9abb6e2a74e6725df30d73a3b77a5f4d369ebe7f
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/trainer_state.json ADDED
@@ -0,0 +1,1774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1744,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2426587224006653,
14
+ "epoch": 0.011467889908256881,
15
+ "grad_norm": 3.028918981552124,
16
+ "learning_rate": 2.0642201834862385e-06,
17
+ "loss": 0.6546,
18
+ "mean_token_accuracy": 0.7775660812854767,
19
+ "num_tokens": 14294.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2369988679885864,
24
+ "epoch": 0.022935779816513763,
25
+ "grad_norm": 2.726299524307251,
26
+ "learning_rate": 4.357798165137615e-06,
27
+ "loss": 0.6567,
28
+ "mean_token_accuracy": 0.7740493595600129,
29
+ "num_tokens": 28037.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2392263770103455,
34
+ "epoch": 0.034403669724770644,
35
+ "grad_norm": 2.2062387466430664,
36
+ "learning_rate": 6.651376146788992e-06,
37
+ "loss": 0.6411,
38
+ "mean_token_accuracy": 0.7753027558326722,
39
+ "num_tokens": 42268.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.2435187816619873,
44
+ "epoch": 0.045871559633027525,
45
+ "grad_norm": 2.4188315868377686,
46
+ "learning_rate": 8.944954128440369e-06,
47
+ "loss": 0.6271,
48
+ "mean_token_accuracy": 0.7827982664108276,
49
+ "num_tokens": 56474.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.261301326751709,
54
+ "epoch": 0.05733944954128441,
55
+ "grad_norm": 2.490365982055664,
56
+ "learning_rate": 1.1238532110091744e-05,
57
+ "loss": 0.6343,
58
+ "mean_token_accuracy": 0.7792274355888367,
59
+ "num_tokens": 70348.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.2830254554748535,
64
+ "epoch": 0.06880733944954129,
65
+ "grad_norm": 2.6820173263549805,
66
+ "learning_rate": 1.3532110091743119e-05,
67
+ "loss": 0.6298,
68
+ "mean_token_accuracy": 0.7811978995800019,
69
+ "num_tokens": 84857.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.275150680541992,
74
+ "epoch": 0.08027522935779817,
75
+ "grad_norm": 1.8796888589859009,
76
+ "learning_rate": 1.5825688073394497e-05,
77
+ "loss": 0.6689,
78
+ "mean_token_accuracy": 0.7587406218051911,
79
+ "num_tokens": 99396.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.2276832342147828,
84
+ "epoch": 0.09174311926605505,
85
+ "grad_norm": 1.690596580505371,
86
+ "learning_rate": 1.811926605504587e-05,
87
+ "loss": 0.5586,
88
+ "mean_token_accuracy": 0.8031339049339294,
89
+ "num_tokens": 112619.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.2304488182067872,
94
+ "epoch": 0.10321100917431193,
95
+ "grad_norm": 1.7719478607177734,
96
+ "learning_rate": 2.0412844036697248e-05,
97
+ "loss": 0.6093,
98
+ "mean_token_accuracy": 0.7849110841751099,
99
+ "num_tokens": 126496.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.2812824487686156,
104
+ "epoch": 0.11467889908256881,
105
+ "grad_norm": 1.5947109460830688,
106
+ "learning_rate": 2.2706422018348624e-05,
107
+ "loss": 0.6455,
108
+ "mean_token_accuracy": 0.7670063555240632,
109
+ "num_tokens": 140628.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.2834580063819885,
114
+ "epoch": 0.12614678899082568,
115
+ "grad_norm": 1.8809813261032104,
116
+ "learning_rate": 2.5e-05,
117
+ "loss": 0.6489,
118
+ "mean_token_accuracy": 0.7716078400611878,
119
+ "num_tokens": 153959.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.2611944794654846,
124
+ "epoch": 0.13761467889908258,
125
+ "grad_norm": 1.780765414237976,
126
+ "learning_rate": 2.7293577981651375e-05,
127
+ "loss": 0.5445,
128
+ "mean_token_accuracy": 0.8067175269126892,
129
+ "num_tokens": 168207.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.272983717918396,
134
+ "epoch": 0.14908256880733944,
135
+ "grad_norm": 2.119795560836792,
136
+ "learning_rate": 2.9587155963302755e-05,
137
+ "loss": 0.6021,
138
+ "mean_token_accuracy": 0.7827390134334564,
139
+ "num_tokens": 182042.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.2775203227996825,
144
+ "epoch": 0.16055045871559634,
145
+ "grad_norm": 1.4455509185791016,
146
+ "learning_rate": 3.188073394495413e-05,
147
+ "loss": 0.5715,
148
+ "mean_token_accuracy": 0.7992757976055145,
149
+ "num_tokens": 196015.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.2987091898918153,
154
+ "epoch": 0.1720183486238532,
155
+ "grad_norm": 1.4980850219726562,
156
+ "learning_rate": 3.4174311926605505e-05,
157
+ "loss": 0.6023,
158
+ "mean_token_accuracy": 0.7867661654949188,
159
+ "num_tokens": 210215.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.2755884647369384,
164
+ "epoch": 0.1834862385321101,
165
+ "grad_norm": 1.8361093997955322,
166
+ "learning_rate": 3.646788990825688e-05,
167
+ "loss": 0.6042,
168
+ "mean_token_accuracy": 0.7830365121364593,
169
+ "num_tokens": 224017.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.2698514223098756,
174
+ "epoch": 0.19495412844036697,
175
+ "grad_norm": 1.3812596797943115,
176
+ "learning_rate": 3.876146788990826e-05,
177
+ "loss": 0.5846,
178
+ "mean_token_accuracy": 0.7849388122558594,
179
+ "num_tokens": 237482.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.244412088394165,
184
+ "epoch": 0.20642201834862386,
185
+ "grad_norm": 1.798601746559143,
186
+ "learning_rate": 4.1055045871559636e-05,
187
+ "loss": 0.547,
188
+ "mean_token_accuracy": 0.8001754701137542,
189
+ "num_tokens": 251169.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.2745447516441346,
194
+ "epoch": 0.21788990825688073,
195
+ "grad_norm": 1.9844188690185547,
196
+ "learning_rate": 4.334862385321101e-05,
197
+ "loss": 0.5623,
198
+ "mean_token_accuracy": 0.7987869799137115,
199
+ "num_tokens": 264764.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.3000339031219483,
204
+ "epoch": 0.22935779816513763,
205
+ "grad_norm": 1.4861323833465576,
206
+ "learning_rate": 4.564220183486239e-05,
207
+ "loss": 0.6123,
208
+ "mean_token_accuracy": 0.7852487206459046,
209
+ "num_tokens": 278504.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.3115605235099792,
214
+ "epoch": 0.2408256880733945,
215
+ "grad_norm": 1.8251842260360718,
216
+ "learning_rate": 4.7935779816513766e-05,
217
+ "loss": 0.658,
218
+ "mean_token_accuracy": 0.759744155406952,
219
+ "num_tokens": 293021.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.3224670886993408,
224
+ "epoch": 0.25229357798165136,
225
+ "grad_norm": 1.5895862579345703,
226
+ "learning_rate": 5.022935779816514e-05,
227
+ "loss": 0.6213,
228
+ "mean_token_accuracy": 0.7748652398586273,
229
+ "num_tokens": 306811.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.2993175029754638,
234
+ "epoch": 0.26376146788990823,
235
+ "grad_norm": 1.6444000005722046,
236
+ "learning_rate": 5.252293577981652e-05,
237
+ "loss": 0.5723,
238
+ "mean_token_accuracy": 0.7920855104923248,
239
+ "num_tokens": 320873.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.3019076824188232,
244
+ "epoch": 0.27522935779816515,
245
+ "grad_norm": 1.5319479703903198,
246
+ "learning_rate": 5.481651376146789e-05,
247
+ "loss": 0.5945,
248
+ "mean_token_accuracy": 0.7845574736595153,
249
+ "num_tokens": 335323.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.289823544025421,
254
+ "epoch": 0.286697247706422,
255
+ "grad_norm": 1.2900819778442383,
256
+ "learning_rate": 5.7110091743119266e-05,
257
+ "loss": 0.5692,
258
+ "mean_token_accuracy": 0.7950143396854401,
259
+ "num_tokens": 349575.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.2885443449020386,
264
+ "epoch": 0.2981651376146789,
265
+ "grad_norm": 1.408677577972412,
266
+ "learning_rate": 5.940366972477065e-05,
267
+ "loss": 0.6104,
268
+ "mean_token_accuracy": 0.7816850125789643,
269
+ "num_tokens": 363961.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.3065945267677308,
274
+ "epoch": 0.30963302752293576,
275
+ "grad_norm": 1.3809661865234375,
276
+ "learning_rate": 6.169724770642203e-05,
277
+ "loss": 0.6426,
278
+ "mean_token_accuracy": 0.7661891877651215,
279
+ "num_tokens": 377555.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.2889371395111084,
284
+ "epoch": 0.3211009174311927,
285
+ "grad_norm": 1.4974966049194336,
286
+ "learning_rate": 6.39908256880734e-05,
287
+ "loss": 0.5882,
288
+ "mean_token_accuracy": 0.7819968700408936,
289
+ "num_tokens": 391423.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.2952162742614746,
294
+ "epoch": 0.33256880733944955,
295
+ "grad_norm": 1.3621913194656372,
296
+ "learning_rate": 6.628440366972477e-05,
297
+ "loss": 0.57,
298
+ "mean_token_accuracy": 0.7946897804737091,
299
+ "num_tokens": 405650.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.2888988494873046,
304
+ "epoch": 0.3440366972477064,
305
+ "grad_norm": 1.793961524963379,
306
+ "learning_rate": 6.857798165137616e-05,
307
+ "loss": 0.6273,
308
+ "mean_token_accuracy": 0.7732390701770783,
309
+ "num_tokens": 419332.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.289334809780121,
314
+ "epoch": 0.3555045871559633,
315
+ "grad_norm": 1.5518903732299805,
316
+ "learning_rate": 7.087155963302753e-05,
317
+ "loss": 0.6492,
318
+ "mean_token_accuracy": 0.757664144039154,
319
+ "num_tokens": 433432.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.2899688124656676,
324
+ "epoch": 0.3669724770642202,
325
+ "grad_norm": 1.5826157331466675,
326
+ "learning_rate": 7.31651376146789e-05,
327
+ "loss": 0.5805,
328
+ "mean_token_accuracy": 0.7921296834945679,
329
+ "num_tokens": 447592.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.2863509058952332,
334
+ "epoch": 0.37844036697247707,
335
+ "grad_norm": 1.7210900783538818,
336
+ "learning_rate": 7.545871559633027e-05,
337
+ "loss": 0.5926,
338
+ "mean_token_accuracy": 0.7852405548095703,
339
+ "num_tokens": 462489.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.2890722513198853,
344
+ "epoch": 0.38990825688073394,
345
+ "grad_norm": 1.6051267385482788,
346
+ "learning_rate": 7.775229357798165e-05,
347
+ "loss": 0.6173,
348
+ "mean_token_accuracy": 0.7741429924964904,
349
+ "num_tokens": 476591.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.313070333003998,
354
+ "epoch": 0.4013761467889908,
355
+ "grad_norm": 1.7080140113830566,
356
+ "learning_rate": 8.004587155963303e-05,
357
+ "loss": 0.6165,
358
+ "mean_token_accuracy": 0.7842044055461883,
359
+ "num_tokens": 491338.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.2972561955451964,
364
+ "epoch": 0.41284403669724773,
365
+ "grad_norm": 1.7454527616500854,
366
+ "learning_rate": 8.23394495412844e-05,
367
+ "loss": 0.5927,
368
+ "mean_token_accuracy": 0.7841840922832489,
369
+ "num_tokens": 505152.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.2944241881370544,
374
+ "epoch": 0.4243119266055046,
375
+ "grad_norm": 1.8223613500595093,
376
+ "learning_rate": 8.463302752293578e-05,
377
+ "loss": 0.5862,
378
+ "mean_token_accuracy": 0.7846642255783081,
379
+ "num_tokens": 519536.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.2918418169021606,
384
+ "epoch": 0.43577981651376146,
385
+ "grad_norm": 1.323716640472412,
386
+ "learning_rate": 8.692660550458716e-05,
387
+ "loss": 0.5761,
388
+ "mean_token_accuracy": 0.788896131515503,
389
+ "num_tokens": 533610.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.3106001019477844,
394
+ "epoch": 0.44724770642201833,
395
+ "grad_norm": 2.1389827728271484,
396
+ "learning_rate": 8.922018348623854e-05,
397
+ "loss": 0.6442,
398
+ "mean_token_accuracy": 0.7677759766578675,
399
+ "num_tokens": 547213.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.2924273014068604,
404
+ "epoch": 0.45871559633027525,
405
+ "grad_norm": 1.3077127933502197,
406
+ "learning_rate": 9.151376146788991e-05,
407
+ "loss": 0.6044,
408
+ "mean_token_accuracy": 0.7855095267295837,
409
+ "num_tokens": 560707.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.3057442545890807,
414
+ "epoch": 0.4701834862385321,
415
+ "grad_norm": 1.658679723739624,
416
+ "learning_rate": 9.380733944954129e-05,
417
+ "loss": 0.5803,
418
+ "mean_token_accuracy": 0.7926251292228699,
419
+ "num_tokens": 574533.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.3044120788574218,
424
+ "epoch": 0.481651376146789,
425
+ "grad_norm": 1.7965151071548462,
426
+ "learning_rate": 9.610091743119267e-05,
427
+ "loss": 0.5984,
428
+ "mean_token_accuracy": 0.7874112606048584,
429
+ "num_tokens": 587931.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.3121570587158202,
434
+ "epoch": 0.49311926605504586,
435
+ "grad_norm": 1.1833796501159668,
436
+ "learning_rate": 9.839449541284404e-05,
437
+ "loss": 0.6231,
438
+ "mean_token_accuracy": 0.7761680126190186,
439
+ "num_tokens": 602080.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.3229384422302246,
444
+ "epoch": 0.5045871559633027,
445
+ "grad_norm": 1.98506760597229,
446
+ "learning_rate": 9.99999676404826e-05,
447
+ "loss": 0.6223,
448
+ "mean_token_accuracy": 0.774652361869812,
449
+ "num_tokens": 615535.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.2842121720314026,
454
+ "epoch": 0.5160550458715596,
455
+ "grad_norm": 1.8412768840789795,
456
+ "learning_rate": 9.999939236133826e-05,
457
+ "loss": 0.5968,
458
+ "mean_token_accuracy": 0.7840604305267334,
459
+ "num_tokens": 628767.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.3064908266067505,
464
+ "epoch": 0.5275229357798165,
465
+ "grad_norm": 1.7538436651229858,
466
+ "learning_rate": 9.999809799133033e-05,
467
+ "loss": 0.6244,
468
+ "mean_token_accuracy": 0.7701604008674622,
469
+ "num_tokens": 642874.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.3011385202407837,
474
+ "epoch": 0.5389908256880734,
475
+ "grad_norm": 2.0401413440704346,
476
+ "learning_rate": 9.99960845490744e-05,
477
+ "loss": 0.5897,
478
+ "mean_token_accuracy": 0.7876223146915435,
479
+ "num_tokens": 656374.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.3175038933753966,
484
+ "epoch": 0.5504587155963303,
485
+ "grad_norm": 1.5815656185150146,
486
+ "learning_rate": 9.999335206352783e-05,
487
+ "loss": 0.6681,
488
+ "mean_token_accuracy": 0.7586038947105408,
489
+ "num_tokens": 670397.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.3054586052894592,
494
+ "epoch": 0.5619266055045872,
495
+ "grad_norm": 1.7010897397994995,
496
+ "learning_rate": 9.998990057398916e-05,
497
+ "loss": 0.6488,
498
+ "mean_token_accuracy": 0.7646380603313446,
499
+ "num_tokens": 684143.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.2969472885131836,
504
+ "epoch": 0.573394495412844,
505
+ "grad_norm": 2.1294353008270264,
506
+ "learning_rate": 9.998573013009771e-05,
507
+ "loss": 0.6505,
508
+ "mean_token_accuracy": 0.7664439141750335,
509
+ "num_tokens": 697427.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.3074483752250672,
514
+ "epoch": 0.5848623853211009,
515
+ "grad_norm": 2.1683812141418457,
516
+ "learning_rate": 9.998084079183276e-05,
517
+ "loss": 0.5897,
518
+ "mean_token_accuracy": 0.7885696291923523,
519
+ "num_tokens": 711947.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.2956400752067565,
524
+ "epoch": 0.5963302752293578,
525
+ "grad_norm": 1.4167346954345703,
526
+ "learning_rate": 9.997523262951274e-05,
527
+ "loss": 0.6388,
528
+ "mean_token_accuracy": 0.7672183573246002,
529
+ "num_tokens": 726268.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.315368902683258,
534
+ "epoch": 0.6077981651376146,
535
+ "grad_norm": 2.1706671714782715,
536
+ "learning_rate": 9.996890572379418e-05,
537
+ "loss": 0.6844,
538
+ "mean_token_accuracy": 0.7582804381847381,
539
+ "num_tokens": 740230.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.2926068663597108,
544
+ "epoch": 0.6192660550458715,
545
+ "grad_norm": 1.6460140943527222,
546
+ "learning_rate": 9.99618601656706e-05,
547
+ "loss": 0.5693,
548
+ "mean_token_accuracy": 0.795549190044403,
549
+ "num_tokens": 754570.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.2848342299461364,
554
+ "epoch": 0.6307339449541285,
555
+ "grad_norm": 1.7705565690994263,
556
+ "learning_rate": 9.995409605647117e-05,
557
+ "loss": 0.6189,
558
+ "mean_token_accuracy": 0.7828136622905731,
559
+ "num_tokens": 768740.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.3091715574264526,
564
+ "epoch": 0.6422018348623854,
565
+ "grad_norm": 1.7903367280960083,
566
+ "learning_rate": 9.994561350785923e-05,
567
+ "loss": 0.6096,
568
+ "mean_token_accuracy": 0.7809465050697326,
569
+ "num_tokens": 782860.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.3097781181335448,
574
+ "epoch": 0.6536697247706422,
575
+ "grad_norm": 1.6261135339736938,
576
+ "learning_rate": 9.993641264183074e-05,
577
+ "loss": 0.6488,
578
+ "mean_token_accuracy": 0.7686248242855072,
579
+ "num_tokens": 796852.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.2892103433609008,
584
+ "epoch": 0.6651376146788991,
585
+ "grad_norm": 1.530013084411621,
586
+ "learning_rate": 9.992649359071247e-05,
587
+ "loss": 0.6099,
588
+ "mean_token_accuracy": 0.7832099735736847,
589
+ "num_tokens": 810833.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.2781771540641784,
594
+ "epoch": 0.676605504587156,
595
+ "grad_norm": 1.3513305187225342,
596
+ "learning_rate": 9.991585649716014e-05,
597
+ "loss": 0.6059,
598
+ "mean_token_accuracy": 0.7849724233150482,
599
+ "num_tokens": 825129.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.289398467540741,
604
+ "epoch": 0.6880733944954128,
605
+ "grad_norm": 1.2714006900787354,
606
+ "learning_rate": 9.990450151415636e-05,
607
+ "loss": 0.6262,
608
+ "mean_token_accuracy": 0.7734242856502533,
609
+ "num_tokens": 839084.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.3282314896583558,
614
+ "epoch": 0.6995412844036697,
615
+ "grad_norm": 1.6062265634536743,
616
+ "learning_rate": 9.989242880500837e-05,
617
+ "loss": 0.6804,
618
+ "mean_token_accuracy": 0.7598551273345947,
619
+ "num_tokens": 853275.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.279460871219635,
624
+ "epoch": 0.7110091743119266,
625
+ "grad_norm": 1.211531400680542,
626
+ "learning_rate": 9.987963854334581e-05,
627
+ "loss": 0.5422,
628
+ "mean_token_accuracy": 0.8087258577346802,
629
+ "num_tokens": 867001.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.3079694390296936,
634
+ "epoch": 0.7224770642201835,
635
+ "grad_norm": 1.9886008501052856,
636
+ "learning_rate": 9.986613091311811e-05,
637
+ "loss": 0.6505,
638
+ "mean_token_accuracy": 0.7643534898757934,
639
+ "num_tokens": 880836.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.3083110094070434,
644
+ "epoch": 0.7339449541284404,
645
+ "grad_norm": 1.7378991842269897,
646
+ "learning_rate": 9.98519061085919e-05,
647
+ "loss": 0.6507,
648
+ "mean_token_accuracy": 0.7652741134166717,
649
+ "num_tokens": 894456.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.3111968874931335,
654
+ "epoch": 0.7454128440366973,
655
+ "grad_norm": 1.6157206296920776,
656
+ "learning_rate": 9.983696433434821e-05,
657
+ "loss": 0.6009,
658
+ "mean_token_accuracy": 0.7828308165073394,
659
+ "num_tokens": 908581.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.3001808285713197,
664
+ "epoch": 0.7568807339449541,
665
+ "grad_norm": 1.7530412673950195,
666
+ "learning_rate": 9.982130580527951e-05,
667
+ "loss": 0.5973,
668
+ "mean_token_accuracy": 0.7872715950012207,
669
+ "num_tokens": 922198.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.3001506924629211,
674
+ "epoch": 0.768348623853211,
675
+ "grad_norm": 1.8743090629577637,
676
+ "learning_rate": 9.980493074658665e-05,
677
+ "loss": 0.5991,
678
+ "mean_token_accuracy": 0.7848590850830078,
679
+ "num_tokens": 934965.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.3329032421112061,
684
+ "epoch": 0.7798165137614679,
685
+ "grad_norm": 1.646851658821106,
686
+ "learning_rate": 9.978783939377558e-05,
687
+ "loss": 0.646,
688
+ "mean_token_accuracy": 0.76202232837677,
689
+ "num_tokens": 949474.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.3042344450950623,
694
+ "epoch": 0.7912844036697247,
695
+ "grad_norm": 1.6828117370605469,
696
+ "learning_rate": 9.9770031992654e-05,
697
+ "loss": 0.5663,
698
+ "mean_token_accuracy": 0.7932763636112213,
699
+ "num_tokens": 963414.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.3154001832008362,
704
+ "epoch": 0.8027522935779816,
705
+ "grad_norm": 1.8354583978652954,
706
+ "learning_rate": 9.975150879932784e-05,
707
+ "loss": 0.5994,
708
+ "mean_token_accuracy": 0.7792726159095764,
709
+ "num_tokens": 977203.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.307938539981842,
714
+ "epoch": 0.8142201834862385,
715
+ "grad_norm": 1.6509039402008057,
716
+ "learning_rate": 9.97322700801975e-05,
717
+ "loss": 0.5663,
718
+ "mean_token_accuracy": 0.7955432832241058,
719
+ "num_tokens": 990943.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.3173952937126159,
724
+ "epoch": 0.8256880733944955,
725
+ "grad_norm": 1.8522167205810547,
726
+ "learning_rate": 9.971231611195407e-05,
727
+ "loss": 0.614,
728
+ "mean_token_accuracy": 0.7815097570419312,
729
+ "num_tokens": 1005001.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.340037202835083,
734
+ "epoch": 0.8371559633027523,
735
+ "grad_norm": 1.4919304847717285,
736
+ "learning_rate": 9.969164718157538e-05,
737
+ "loss": 0.6348,
738
+ "mean_token_accuracy": 0.7702794313430786,
739
+ "num_tokens": 1018544.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.3305164098739624,
744
+ "epoch": 0.8486238532110092,
745
+ "grad_norm": 1.5445469617843628,
746
+ "learning_rate": 9.967026358632184e-05,
747
+ "loss": 0.6136,
748
+ "mean_token_accuracy": 0.77325798869133,
749
+ "num_tokens": 1032665.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 1.3210863590240478,
754
+ "epoch": 0.8600917431192661,
755
+ "grad_norm": 1.9453340768814087,
756
+ "learning_rate": 9.964816563373212e-05,
757
+ "loss": 0.6514,
758
+ "mean_token_accuracy": 0.7692999839782715,
759
+ "num_tokens": 1047328.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.327096664905548,
764
+ "epoch": 0.8715596330275229,
765
+ "grad_norm": 1.8478624820709229,
766
+ "learning_rate": 9.962535364161879e-05,
767
+ "loss": 0.6003,
768
+ "mean_token_accuracy": 0.7799559772014618,
769
+ "num_tokens": 1061305.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.3272370457649232,
774
+ "epoch": 0.8830275229357798,
775
+ "grad_norm": 1.9946807622909546,
776
+ "learning_rate": 9.960182793806377e-05,
777
+ "loss": 0.6315,
778
+ "mean_token_accuracy": 0.7699635088443756,
779
+ "num_tokens": 1075123.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.3235833764076232,
784
+ "epoch": 0.8944954128440367,
785
+ "grad_norm": 1.500209927558899,
786
+ "learning_rate": 9.957758886141351e-05,
787
+ "loss": 0.6527,
788
+ "mean_token_accuracy": 0.7683537185192109,
789
+ "num_tokens": 1089084.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.312354290485382,
794
+ "epoch": 0.9059633027522935,
795
+ "grad_norm": 1.6548733711242676,
796
+ "learning_rate": 9.955263676027427e-05,
797
+ "loss": 0.5927,
798
+ "mean_token_accuracy": 0.7949600100517273,
799
+ "num_tokens": 1103963.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.3421159029006957,
804
+ "epoch": 0.9174311926605505,
805
+ "grad_norm": 1.5262596607208252,
806
+ "learning_rate": 9.95269719935069e-05,
807
+ "loss": 0.6553,
808
+ "mean_token_accuracy": 0.7679201364517212,
809
+ "num_tokens": 1117901.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.344819176197052,
814
+ "epoch": 0.9288990825688074,
815
+ "grad_norm": 1.42953360080719,
816
+ "learning_rate": 9.950059493022193e-05,
817
+ "loss": 0.6607,
818
+ "mean_token_accuracy": 0.762078708410263,
819
+ "num_tokens": 1132174.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.3429975152015685,
824
+ "epoch": 0.9403669724770642,
825
+ "grad_norm": 1.648417592048645,
826
+ "learning_rate": 9.947350594977402e-05,
827
+ "loss": 0.6929,
828
+ "mean_token_accuracy": 0.7437104344367981,
829
+ "num_tokens": 1146769.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.3269536972045899,
834
+ "epoch": 0.9518348623853211,
835
+ "grad_norm": 1.802235722541809,
836
+ "learning_rate": 9.944570544175673e-05,
837
+ "loss": 0.6676,
838
+ "mean_token_accuracy": 0.7601192831993103,
839
+ "num_tokens": 1161091.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.3191216468811036,
844
+ "epoch": 0.963302752293578,
845
+ "grad_norm": 1.9612555503845215,
846
+ "learning_rate": 9.941719380599672e-05,
847
+ "loss": 0.625,
848
+ "mean_token_accuracy": 0.7729354560375213,
849
+ "num_tokens": 1173905.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.3115869045257569,
854
+ "epoch": 0.9747706422018348,
855
+ "grad_norm": 1.2845028638839722,
856
+ "learning_rate": 9.93879714525481e-05,
857
+ "loss": 0.5944,
858
+ "mean_token_accuracy": 0.7839926242828369,
859
+ "num_tokens": 1188063.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.3091205954551697,
864
+ "epoch": 0.9862385321100917,
865
+ "grad_norm": 1.8383289575576782,
866
+ "learning_rate": 9.935803880168652e-05,
867
+ "loss": 0.6237,
868
+ "mean_token_accuracy": 0.7753754138946534,
869
+ "num_tokens": 1202695.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 1.2994250178337097,
874
+ "epoch": 0.9977064220183486,
875
+ "grad_norm": 1.571912407875061,
876
+ "learning_rate": 9.932739628390316e-05,
877
+ "loss": 0.6456,
878
+ "mean_token_accuracy": 0.7671150684356689,
879
+ "num_tokens": 1216684.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.3076510548591613,
884
+ "epoch": 1.0091743119266054,
885
+ "grad_norm": 1.8406661748886108,
886
+ "learning_rate": 9.929604433989843e-05,
887
+ "loss": 0.6445,
888
+ "mean_token_accuracy": 0.7758039116859436,
889
+ "num_tokens": 1229248.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.2624098420143128,
894
+ "epoch": 1.0206422018348624,
895
+ "grad_norm": 1.9808402061462402,
896
+ "learning_rate": 9.926398342057577e-05,
897
+ "loss": 0.492,
898
+ "mean_token_accuracy": 0.8236800074577332,
899
+ "num_tokens": 1243088.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.252714467048645,
904
+ "epoch": 1.0321100917431192,
905
+ "grad_norm": 2.2568917274475098,
906
+ "learning_rate": 9.923121398703504e-05,
907
+ "loss": 0.4861,
908
+ "mean_token_accuracy": 0.8282331109046936,
909
+ "num_tokens": 1256681.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.2762907862663269,
914
+ "epoch": 1.0435779816513762,
915
+ "grad_norm": 1.7591499090194702,
916
+ "learning_rate": 9.9197736510566e-05,
917
+ "loss": 0.5326,
918
+ "mean_token_accuracy": 0.8061232268810272,
919
+ "num_tokens": 1270563.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.2779451608657837,
924
+ "epoch": 1.0550458715596331,
925
+ "grad_norm": 1.7618857622146606,
926
+ "learning_rate": 9.916355147264142e-05,
927
+ "loss": 0.5762,
928
+ "mean_token_accuracy": 0.7888909459114075,
929
+ "num_tokens": 1284789.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.3000144004821776,
934
+ "epoch": 1.06651376146789,
935
+ "grad_norm": 1.929226040840149,
936
+ "learning_rate": 9.912865936491026e-05,
937
+ "loss": 0.556,
938
+ "mean_token_accuracy": 0.7985962986946106,
939
+ "num_tokens": 1298314.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.2920597314834594,
944
+ "epoch": 1.0779816513761469,
945
+ "grad_norm": 2.1356875896453857,
946
+ "learning_rate": 9.909306068919055e-05,
947
+ "loss": 0.5872,
948
+ "mean_token_accuracy": 0.7914662003517151,
949
+ "num_tokens": 1312524.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.3042231440544128,
954
+ "epoch": 1.0894495412844036,
955
+ "grad_norm": 2.148797035217285,
956
+ "learning_rate": 9.905675595746215e-05,
957
+ "loss": 0.5507,
958
+ "mean_token_accuracy": 0.802655827999115,
959
+ "num_tokens": 1326952.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 1.277776312828064,
964
+ "epoch": 1.1009174311926606,
965
+ "grad_norm": 1.6280494928359985,
966
+ "learning_rate": 9.901974569185941e-05,
967
+ "loss": 0.5579,
968
+ "mean_token_accuracy": 0.8001268386840821,
969
+ "num_tokens": 1341302.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 1.2962275981903075,
974
+ "epoch": 1.1123853211009174,
975
+ "grad_norm": 1.8065513372421265,
976
+ "learning_rate": 9.898203042466368e-05,
977
+ "loss": 0.5492,
978
+ "mean_token_accuracy": 0.8058996260166168,
979
+ "num_tokens": 1355689.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 1.2893213629722595,
984
+ "epoch": 1.1238532110091743,
985
+ "grad_norm": 1.864761233329773,
986
+ "learning_rate": 9.894361069829565e-05,
987
+ "loss": 0.5292,
988
+ "mean_token_accuracy": 0.8077204465866089,
989
+ "num_tokens": 1369850.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 1.2918407797813416,
994
+ "epoch": 1.135321100917431,
995
+ "grad_norm": 2.276775598526001,
996
+ "learning_rate": 9.89044870653075e-05,
997
+ "loss": 0.564,
998
+ "mean_token_accuracy": 0.7952383041381836,
999
+ "num_tokens": 1384054.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 1.281248104572296,
1004
+ "epoch": 1.146788990825688,
1005
+ "grad_norm": 2.1157305240631104,
1006
+ "learning_rate": 9.886466008837503e-05,
1007
+ "loss": 0.5706,
1008
+ "mean_token_accuracy": 0.7949798464775085,
1009
+ "num_tokens": 1398492.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 1.2710728526115418,
1014
+ "epoch": 1.158256880733945,
1015
+ "grad_norm": 1.8817031383514404,
1016
+ "learning_rate": 9.882413034028948e-05,
1017
+ "loss": 0.516,
1018
+ "mean_token_accuracy": 0.8137441635131836,
1019
+ "num_tokens": 1412100.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 1.2870657205581666,
1024
+ "epoch": 1.1697247706422018,
1025
+ "grad_norm": 1.7975279092788696,
1026
+ "learning_rate": 9.878289840394938e-05,
1027
+ "loss": 0.5374,
1028
+ "mean_token_accuracy": 0.8032542705535889,
1029
+ "num_tokens": 1425770.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 1.2666459918022155,
1034
+ "epoch": 1.1811926605504588,
1035
+ "grad_norm": 2.47218656539917,
1036
+ "learning_rate": 9.874096487235212e-05,
1037
+ "loss": 0.5158,
1038
+ "mean_token_accuracy": 0.8173266768455505,
1039
+ "num_tokens": 1439309.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 1.3137032628059386,
1044
+ "epoch": 1.1926605504587156,
1045
+ "grad_norm": 1.7813074588775635,
1046
+ "learning_rate": 9.869833034858538e-05,
1047
+ "loss": 0.5324,
1048
+ "mean_token_accuracy": 0.8099446773529053,
1049
+ "num_tokens": 1454541.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 1.2864318251609803,
1054
+ "epoch": 1.2041284403669725,
1055
+ "grad_norm": 1.9276366233825684,
1056
+ "learning_rate": 9.86549954458186e-05,
1057
+ "loss": 0.5554,
1058
+ "mean_token_accuracy": 0.8048118472099304,
1059
+ "num_tokens": 1468346.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 1.2949382424354554,
1064
+ "epoch": 1.2155963302752293,
1065
+ "grad_norm": 1.9171100854873657,
1066
+ "learning_rate": 9.861096078729396e-05,
1067
+ "loss": 0.5857,
1068
+ "mean_token_accuracy": 0.7923648238182068,
1069
+ "num_tokens": 1482839.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 1.2825786828994752,
1074
+ "epoch": 1.2270642201834863,
1075
+ "grad_norm": 1.458295226097107,
1076
+ "learning_rate": 9.85662270063176e-05,
1077
+ "loss": 0.5344,
1078
+ "mean_token_accuracy": 0.8081244885921478,
1079
+ "num_tokens": 1496532.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 1.2934918642044066,
1084
+ "epoch": 1.238532110091743,
1085
+ "grad_norm": 2.2048583030700684,
1086
+ "learning_rate": 9.852079474625035e-05,
1087
+ "loss": 0.5802,
1088
+ "mean_token_accuracy": 0.7943230092525482,
1089
+ "num_tokens": 1510406.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 1.3103590607643127,
1094
+ "epoch": 1.25,
1095
+ "grad_norm": 2.103316307067871,
1096
+ "learning_rate": 9.847466466049868e-05,
1097
+ "loss": 0.5761,
1098
+ "mean_token_accuracy": 0.7919000566005707,
1099
+ "num_tokens": 1524582.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 1.2943686366081237,
1104
+ "epoch": 1.261467889908257,
1105
+ "grad_norm": 1.8935585021972656,
1106
+ "learning_rate": 9.84278374125051e-05,
1107
+ "loss": 0.5668,
1108
+ "mean_token_accuracy": 0.795119684934616,
1109
+ "num_tokens": 1538645.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 1.2833523988723754,
1114
+ "epoch": 1.2729357798165137,
1115
+ "grad_norm": 1.5310587882995605,
1116
+ "learning_rate": 9.838031367573868e-05,
1117
+ "loss": 0.4791,
1118
+ "mean_token_accuracy": 0.8290136575698852,
1119
+ "num_tokens": 1552198.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 1.2810697436332703,
1124
+ "epoch": 1.2844036697247707,
1125
+ "grad_norm": 1.9493242502212524,
1126
+ "learning_rate": 9.833209413368546e-05,
1127
+ "loss": 0.5479,
1128
+ "mean_token_accuracy": 0.7984305679798126,
1129
+ "num_tokens": 1566248.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 1.2971422672271729,
1134
+ "epoch": 1.2958715596330275,
1135
+ "grad_norm": 2.143052816390991,
1136
+ "learning_rate": 9.828317947983851e-05,
1137
+ "loss": 0.5556,
1138
+ "mean_token_accuracy": 0.7962001860141754,
1139
+ "num_tokens": 1579657.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 1.2938915967941285,
1144
+ "epoch": 1.3073394495412844,
1145
+ "grad_norm": 3.074519395828247,
1146
+ "learning_rate": 9.823357041768797e-05,
1147
+ "loss": 0.5808,
1148
+ "mean_token_accuracy": 0.7921633243560791,
1149
+ "num_tokens": 1594362.0,
1150
+ "step": 1140
1151
+ },
1152
+ {
1153
+ "entropy": 1.3013799428939818,
1154
+ "epoch": 1.3188073394495412,
1155
+ "grad_norm": 2.1249051094055176,
1156
+ "learning_rate": 9.8183267660711e-05,
1157
+ "loss": 0.5679,
1158
+ "mean_token_accuracy": 0.7960763275623322,
1159
+ "num_tokens": 1607995.0,
1160
+ "step": 1150
1161
+ },
1162
+ {
1163
+ "entropy": 1.2755417585372926,
1164
+ "epoch": 1.3302752293577982,
1165
+ "grad_norm": 1.7334320545196533,
1166
+ "learning_rate": 9.813227193236144e-05,
1167
+ "loss": 0.5211,
1168
+ "mean_token_accuracy": 0.8171180784702301,
1169
+ "num_tokens": 1621183.0,
1170
+ "step": 1160
1171
+ },
1172
+ {
1173
+ "entropy": 1.300136685371399,
1174
+ "epoch": 1.341743119266055,
1175
+ "grad_norm": 1.604264259338379,
1176
+ "learning_rate": 9.808058396605945e-05,
1177
+ "loss": 0.5622,
1178
+ "mean_token_accuracy": 0.7956745982170105,
1179
+ "num_tokens": 1634961.0,
1180
+ "step": 1170
1181
+ },
1182
+ {
1183
+ "entropy": 1.2956653475761413,
1184
+ "epoch": 1.353211009174312,
1185
+ "grad_norm": 2.304135322570801,
1186
+ "learning_rate": 9.802820450518095e-05,
1187
+ "loss": 0.5919,
1188
+ "mean_token_accuracy": 0.7799835622310638,
1189
+ "num_tokens": 1648959.0,
1190
+ "step": 1180
1191
+ },
1192
+ {
1193
+ "entropy": 1.3270721554756164,
1194
+ "epoch": 1.364678899082569,
1195
+ "grad_norm": 2.304185390472412,
1196
+ "learning_rate": 9.797513430304695e-05,
1197
+ "loss": 0.6347,
1198
+ "mean_token_accuracy": 0.7729239940643311,
1199
+ "num_tokens": 1662218.0,
1200
+ "step": 1190
1201
+ },
1202
+ {
1203
+ "entropy": 1.3200181603431702,
1204
+ "epoch": 1.3761467889908257,
1205
+ "grad_norm": 2.673722743988037,
1206
+ "learning_rate": 9.792137412291265e-05,
1207
+ "loss": 0.6568,
1208
+ "mean_token_accuracy": 0.7654553771018981,
1209
+ "num_tokens": 1675320.0,
1210
+ "step": 1200
1211
+ },
1212
+ {
1213
+ "entropy": 1.3001809120178223,
1214
+ "epoch": 1.3876146788990826,
1215
+ "grad_norm": 1.8785172700881958,
1216
+ "learning_rate": 9.786692473795654e-05,
1217
+ "loss": 0.5498,
1218
+ "mean_token_accuracy": 0.7971892893314362,
1219
+ "num_tokens": 1688732.0,
1220
+ "step": 1210
1221
+ },
1222
+ {
1223
+ "entropy": 1.2927094459533692,
1224
+ "epoch": 1.3990825688073394,
1225
+ "grad_norm": 2.299051284790039,
1226
+ "learning_rate": 9.781178693126923e-05,
1227
+ "loss": 0.5317,
1228
+ "mean_token_accuracy": 0.812885046005249,
1229
+ "num_tokens": 1702489.0,
1230
+ "step": 1220
1231
+ },
1232
+ {
1233
+ "entropy": 1.2940443515777589,
1234
+ "epoch": 1.4105504587155964,
1235
+ "grad_norm": 2.107447385787964,
1236
+ "learning_rate": 9.775596149584226e-05,
1237
+ "loss": 0.5408,
1238
+ "mean_token_accuracy": 0.8026755452156067,
1239
+ "num_tokens": 1717066.0,
1240
+ "step": 1230
1241
+ },
1242
+ {
1243
+ "entropy": 1.2880491733551025,
1244
+ "epoch": 1.4220183486238533,
1245
+ "grad_norm": 2.120649814605713,
1246
+ "learning_rate": 9.769944923455654e-05,
1247
+ "loss": 0.5122,
1248
+ "mean_token_accuracy": 0.8185527265071869,
1249
+ "num_tokens": 1730503.0,
1250
+ "step": 1240
1251
+ },
1252
+ {
1253
+ "entropy": 1.2935888648033143,
1254
+ "epoch": 1.43348623853211,
1255
+ "grad_norm": 1.8897229433059692,
1256
+ "learning_rate": 9.764225096017102e-05,
1257
+ "loss": 0.5891,
1258
+ "mean_token_accuracy": 0.7794159233570099,
1259
+ "num_tokens": 1744257.0,
1260
+ "step": 1250
1261
+ },
1262
+ {
1263
+ "entropy": 1.2713160991668702,
1264
+ "epoch": 1.4449541284403669,
1265
+ "grad_norm": 1.9189554452896118,
1266
+ "learning_rate": 9.758436749531079e-05,
1267
+ "loss": 0.5146,
1268
+ "mean_token_accuracy": 0.818141633272171,
1269
+ "num_tokens": 1758267.0,
1270
+ "step": 1260
1271
+ },
1272
+ {
1273
+ "entropy": 1.2798304796218871,
1274
+ "epoch": 1.4564220183486238,
1275
+ "grad_norm": 2.2521767616271973,
1276
+ "learning_rate": 9.752579967245538e-05,
1277
+ "loss": 0.5959,
1278
+ "mean_token_accuracy": 0.7902258694171905,
1279
+ "num_tokens": 1771990.0,
1280
+ "step": 1270
1281
+ },
1282
+ {
1283
+ "entropy": 1.296580719947815,
1284
+ "epoch": 1.4678899082568808,
1285
+ "grad_norm": 1.5478334426879883,
1286
+ "learning_rate": 9.746654833392677e-05,
1287
+ "loss": 0.5636,
1288
+ "mean_token_accuracy": 0.8009288847446442,
1289
+ "num_tokens": 1786045.0,
1290
+ "step": 1280
1291
+ },
1292
+ {
1293
+ "entropy": 1.2467906951904297,
1294
+ "epoch": 1.4793577981651376,
1295
+ "grad_norm": 1.8531265258789062,
1296
+ "learning_rate": 9.740661433187725e-05,
1297
+ "loss": 0.4514,
1298
+ "mean_token_accuracy": 0.8369600057601929,
1299
+ "num_tokens": 1800019.0,
1300
+ "step": 1290
1301
+ },
1302
+ {
1303
+ "entropy": 1.2813060760498047,
1304
+ "epoch": 1.4908256880733946,
1305
+ "grad_norm": 2.007786512374878,
1306
+ "learning_rate": 9.734599852827712e-05,
1307
+ "loss": 0.5587,
1308
+ "mean_token_accuracy": 0.8045243263244629,
1309
+ "num_tokens": 1814394.0,
1310
+ "step": 1300
1311
+ },
1312
+ {
1313
+ "entropy": 1.2923226833343506,
1314
+ "epoch": 1.5022935779816513,
1315
+ "grad_norm": 2.0562584400177,
1316
+ "learning_rate": 9.728470179490244e-05,
1317
+ "loss": 0.563,
1318
+ "mean_token_accuracy": 0.79967080950737,
1319
+ "num_tokens": 1827604.0,
1320
+ "step": 1310
1321
+ },
1322
+ {
1323
+ "entropy": 1.28248028755188,
1324
+ "epoch": 1.5137614678899083,
1325
+ "grad_norm": 1.8021918535232544,
1326
+ "learning_rate": 9.72227250133223e-05,
1327
+ "loss": 0.5535,
1328
+ "mean_token_accuracy": 0.8028985977172851,
1329
+ "num_tokens": 1841751.0,
1330
+ "step": 1320
1331
+ },
1332
+ {
1333
+ "entropy": 1.2800176739692688,
1334
+ "epoch": 1.5252293577981653,
1335
+ "grad_norm": 2.0901622772216797,
1336
+ "learning_rate": 9.71600690748863e-05,
1337
+ "loss": 0.5889,
1338
+ "mean_token_accuracy": 0.7968101024627685,
1339
+ "num_tokens": 1856403.0,
1340
+ "step": 1330
1341
+ },
1342
+ {
1343
+ "entropy": 1.2775539755821228,
1344
+ "epoch": 1.536697247706422,
1345
+ "grad_norm": 1.9024734497070312,
1346
+ "learning_rate": 9.709673488071163e-05,
1347
+ "loss": 0.5529,
1348
+ "mean_token_accuracy": 0.7998219549655914,
1349
+ "num_tokens": 1870952.0,
1350
+ "step": 1340
1351
+ },
1352
+ {
1353
+ "entropy": 1.3066880822181701,
1354
+ "epoch": 1.5481651376146788,
1355
+ "grad_norm": 2.2026913166046143,
1356
+ "learning_rate": 9.70327233416702e-05,
1357
+ "loss": 0.6146,
1358
+ "mean_token_accuracy": 0.7799036145210266,
1359
+ "num_tokens": 1884850.0,
1360
+ "step": 1350
1361
+ },
1362
+ {
1363
+ "entropy": 1.2854471683502198,
1364
+ "epoch": 1.5596330275229358,
1365
+ "grad_norm": 1.995058298110962,
1366
+ "learning_rate": 9.696803537837542e-05,
1367
+ "loss": 0.5744,
1368
+ "mean_token_accuracy": 0.7955298364162445,
1369
+ "num_tokens": 1898895.0,
1370
+ "step": 1360
1371
+ },
1372
+ {
1373
+ "entropy": 1.2856696963310241,
1374
+ "epoch": 1.5711009174311927,
1375
+ "grad_norm": 1.913603663444519,
1376
+ "learning_rate": 9.690267192116908e-05,
1377
+ "loss": 0.525,
1378
+ "mean_token_accuracy": 0.8169679343700409,
1379
+ "num_tokens": 1913026.0,
1380
+ "step": 1370
1381
+ },
1382
+ {
1383
+ "entropy": 1.3183680534362794,
1384
+ "epoch": 1.5825688073394495,
1385
+ "grad_norm": 2.7248916625976562,
1386
+ "learning_rate": 9.683663391010791e-05,
1387
+ "loss": 0.6482,
1388
+ "mean_token_accuracy": 0.7678777754306794,
1389
+ "num_tokens": 1927053.0,
1390
+ "step": 1380
1391
+ },
1392
+ {
1393
+ "entropy": 1.298743522167206,
1394
+ "epoch": 1.5940366972477065,
1395
+ "grad_norm": 2.011831521987915,
1396
+ "learning_rate": 9.676992229495004e-05,
1397
+ "loss": 0.577,
1398
+ "mean_token_accuracy": 0.7876397609710694,
1399
+ "num_tokens": 1940596.0,
1400
+ "step": 1390
1401
+ },
1402
+ {
1403
+ "entropy": 1.294689130783081,
1404
+ "epoch": 1.6055045871559632,
1405
+ "grad_norm": 2.2598249912261963,
1406
+ "learning_rate": 9.670253803514142e-05,
1407
+ "loss": 0.5746,
1408
+ "mean_token_accuracy": 0.7938637971878052,
1409
+ "num_tokens": 1955635.0,
1410
+ "step": 1400
1411
+ },
1412
+ {
1413
+ "entropy": 1.3118200659751893,
1414
+ "epoch": 1.6169724770642202,
1415
+ "grad_norm": 1.9109872579574585,
1416
+ "learning_rate": 9.66344820998019e-05,
1417
+ "loss": 0.5996,
1418
+ "mean_token_accuracy": 0.7869695067405701,
1419
+ "num_tokens": 1970187.0,
1420
+ "step": 1410
1421
+ },
1422
+ {
1423
+ "entropy": 1.2969690084457397,
1424
+ "epoch": 1.6284403669724772,
1425
+ "grad_norm": 2.021652936935425,
1426
+ "learning_rate": 9.656575546771144e-05,
1427
+ "loss": 0.5692,
1428
+ "mean_token_accuracy": 0.7921172618865967,
1429
+ "num_tokens": 1983963.0,
1430
+ "step": 1420
1431
+ },
1432
+ {
1433
+ "entropy": 1.3053216218948365,
1434
+ "epoch": 1.639908256880734,
1435
+ "grad_norm": 2.056626081466675,
1436
+ "learning_rate": 9.649635912729589e-05,
1437
+ "loss": 0.5534,
1438
+ "mean_token_accuracy": 0.7994763553142548,
1439
+ "num_tokens": 1997426.0,
1440
+ "step": 1430
1441
+ },
1442
+ {
1443
+ "entropy": 1.307614517211914,
1444
+ "epoch": 1.6513761467889907,
1445
+ "grad_norm": 2.0294957160949707,
1446
+ "learning_rate": 9.642629407661288e-05,
1447
+ "loss": 0.6113,
1448
+ "mean_token_accuracy": 0.7812033116817474,
1449
+ "num_tokens": 2011810.0,
1450
+ "step": 1440
1451
+ },
1452
+ {
1453
+ "entropy": 1.2840725421905517,
1454
+ "epoch": 1.6628440366972477,
1455
+ "grad_norm": 2.376054525375366,
1456
+ "learning_rate": 9.63555613233374e-05,
1457
+ "loss": 0.5333,
1458
+ "mean_token_accuracy": 0.8069488048553467,
1459
+ "num_tokens": 2025702.0,
1460
+ "step": 1450
1461
+ },
1462
+ {
1463
+ "entropy": 1.2848711609840393,
1464
+ "epoch": 1.6743119266055047,
1465
+ "grad_norm": 2.387098550796509,
1466
+ "learning_rate": 9.628416188474735e-05,
1467
+ "loss": 0.5295,
1468
+ "mean_token_accuracy": 0.8113990724086761,
1469
+ "num_tokens": 2040039.0,
1470
+ "step": 1460
1471
+ },
1472
+ {
1473
+ "entropy": 1.3038938522338868,
1474
+ "epoch": 1.6857798165137616,
1475
+ "grad_norm": 2.6049790382385254,
1476
+ "learning_rate": 9.621209678770889e-05,
1477
+ "loss": 0.5902,
1478
+ "mean_token_accuracy": 0.7839356422424316,
1479
+ "num_tokens": 2054883.0,
1480
+ "step": 1470
1481
+ },
1482
+ {
1483
+ "entropy": 1.3001854181289674,
1484
+ "epoch": 1.6972477064220184,
1485
+ "grad_norm": 2.08150577545166,
1486
+ "learning_rate": 9.613936706866168e-05,
1487
+ "loss": 0.5804,
1488
+ "mean_token_accuracy": 0.7912817001342773,
1489
+ "num_tokens": 2068892.0,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "entropy": 1.2911452770233154,
1494
+ "epoch": 1.7087155963302751,
1495
+ "grad_norm": 2.2386717796325684,
1496
+ "learning_rate": 9.606597377360396e-05,
1497
+ "loss": 0.5902,
1498
+ "mean_token_accuracy": 0.7858116149902343,
1499
+ "num_tokens": 2083075.0,
1500
+ "step": 1490
1501
+ },
1502
+ {
1503
+ "entropy": 1.2923203349113463,
1504
+ "epoch": 1.7201834862385321,
1505
+ "grad_norm": 1.9360357522964478,
1506
+ "learning_rate": 9.59919179580775e-05,
1507
+ "loss": 0.5931,
1508
+ "mean_token_accuracy": 0.7880455732345581,
1509
+ "num_tokens": 2097088.0,
1510
+ "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 1.2811247110366821,
1514
+ "epoch": 1.731651376146789,
1515
+ "grad_norm": 2.346832275390625,
1516
+ "learning_rate": 9.591720068715247e-05,
1517
+ "loss": 0.5381,
1518
+ "mean_token_accuracy": 0.8110429465770721,
1519
+ "num_tokens": 2110713.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 1.2997817516326904,
1524
+ "epoch": 1.7431192660550459,
1525
+ "grad_norm": 2.1013338565826416,
1526
+ "learning_rate": 9.584182303541205e-05,
1527
+ "loss": 0.5771,
1528
+ "mean_token_accuracy": 0.7898500382900238,
1529
+ "num_tokens": 2124467.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 1.283075988292694,
1534
+ "epoch": 1.7545871559633026,
1535
+ "grad_norm": 1.718410849571228,
1536
+ "learning_rate": 9.576578608693703e-05,
1537
+ "loss": 0.5545,
1538
+ "mean_token_accuracy": 0.8036096036434174,
1539
+ "num_tokens": 2139017.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 1.2541950225830079,
1544
+ "epoch": 1.7660550458715596,
1545
+ "grad_norm": 2.381345510482788,
1546
+ "learning_rate": 9.568909093529022e-05,
1547
+ "loss": 0.5071,
1548
+ "mean_token_accuracy": 0.8172869801521301,
1549
+ "num_tokens": 2153212.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 1.2600136041641234,
1554
+ "epoch": 1.7775229357798166,
1555
+ "grad_norm": 1.9568657875061035,
1556
+ "learning_rate": 9.561173868350067e-05,
1557
+ "loss": 0.5251,
1558
+ "mean_token_accuracy": 0.8089884519577026,
1559
+ "num_tokens": 2167190.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 1.2688735485076905,
1564
+ "epoch": 1.7889908256880735,
1565
+ "grad_norm": 2.0126872062683105,
1566
+ "learning_rate": 9.553373044404783e-05,
1567
+ "loss": 0.5563,
1568
+ "mean_token_accuracy": 0.8013049483299255,
1569
+ "num_tokens": 2181135.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 1.2632331728935242,
1574
+ "epoch": 1.8004587155963303,
1575
+ "grad_norm": 1.7177560329437256,
1576
+ "learning_rate": 9.54550673388456e-05,
1577
+ "loss": 0.5456,
1578
+ "mean_token_accuracy": 0.8039442837238312,
1579
+ "num_tokens": 2195099.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 1.2656291127204895,
1584
+ "epoch": 1.811926605504587,
1585
+ "grad_norm": 2.6126630306243896,
1586
+ "learning_rate": 9.537575049922613e-05,
1587
+ "loss": 0.5516,
1588
+ "mean_token_accuracy": 0.7961392283439637,
1589
+ "num_tokens": 2209220.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 1.278434193134308,
1594
+ "epoch": 1.823394495412844,
1595
+ "grad_norm": 2.216356039047241,
1596
+ "learning_rate": 9.52957810659236e-05,
1597
+ "loss": 0.548,
1598
+ "mean_token_accuracy": 0.7977044761180878,
1599
+ "num_tokens": 2222873.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 1.285041868686676,
1604
+ "epoch": 1.834862385321101,
1605
+ "grad_norm": 2.2278988361358643,
1606
+ "learning_rate": 9.521516018905771e-05,
1607
+ "loss": 0.5905,
1608
+ "mean_token_accuracy": 0.7802383601665497,
1609
+ "num_tokens": 2237054.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 1.2938857316970824,
1614
+ "epoch": 1.8463302752293578,
1615
+ "grad_norm": 2.0378856658935547,
1616
+ "learning_rate": 9.513388902811733e-05,
1617
+ "loss": 0.6033,
1618
+ "mean_token_accuracy": 0.7891092479228974,
1619
+ "num_tokens": 2250581.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 1.2730875372886659,
1624
+ "epoch": 1.8577981651376145,
1625
+ "grad_norm": 1.9576410055160522,
1626
+ "learning_rate": 9.505196875194362e-05,
1627
+ "loss": 0.5709,
1628
+ "mean_token_accuracy": 0.7948619246482849,
1629
+ "num_tokens": 2264352.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 1.2942588448524475,
1634
+ "epoch": 1.8692660550458715,
1635
+ "grad_norm": 3.2486989498138428,
1636
+ "learning_rate": 9.496940053871333e-05,
1637
+ "loss": 0.5695,
1638
+ "mean_token_accuracy": 0.7931654870510101,
1639
+ "num_tokens": 2278395.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 1.2859179735183717,
1644
+ "epoch": 1.8807339449541285,
1645
+ "grad_norm": 1.7161357402801514,
1646
+ "learning_rate": 9.488618557592187e-05,
1647
+ "loss": 0.5588,
1648
+ "mean_token_accuracy": 0.7988445639610291,
1649
+ "num_tokens": 2292458.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 1.287862777709961,
1654
+ "epoch": 1.8922018348623855,
1655
+ "grad_norm": 1.7279341220855713,
1656
+ "learning_rate": 9.480232506036618e-05,
1657
+ "loss": 0.5718,
1658
+ "mean_token_accuracy": 0.7963582694530487,
1659
+ "num_tokens": 2305950.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 1.2868569016456604,
1664
+ "epoch": 1.9036697247706422,
1665
+ "grad_norm": 1.7532700300216675,
1666
+ "learning_rate": 9.471782019812748e-05,
1667
+ "loss": 0.5739,
1668
+ "mean_token_accuracy": 0.7951330602169037,
1669
+ "num_tokens": 2320092.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 1.2931817889213562,
1674
+ "epoch": 1.915137614678899,
1675
+ "grad_norm": 2.7232377529144287,
1676
+ "learning_rate": 9.463267220455408e-05,
1677
+ "loss": 0.5996,
1678
+ "mean_token_accuracy": 0.7812487840652466,
1679
+ "num_tokens": 2334035.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 1.2847351789474488,
1684
+ "epoch": 1.926605504587156,
1685
+ "grad_norm": 2.1023809909820557,
1686
+ "learning_rate": 9.454688230424372e-05,
1687
+ "loss": 0.5516,
1688
+ "mean_token_accuracy": 0.8027086973190307,
1689
+ "num_tokens": 2348205.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 1.2944233417510986,
1694
+ "epoch": 1.938073394495413,
1695
+ "grad_norm": 2.789158582687378,
1696
+ "learning_rate": 9.446045173102607e-05,
1697
+ "loss": 0.6096,
1698
+ "mean_token_accuracy": 0.7904924273490905,
1699
+ "num_tokens": 2362411.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 1.296918225288391,
1704
+ "epoch": 1.9495412844036697,
1705
+ "grad_norm": 2.8648757934570312,
1706
+ "learning_rate": 9.437338172794495e-05,
1707
+ "loss": 0.5851,
1708
+ "mean_token_accuracy": 0.7826291382312774,
1709
+ "num_tokens": 2376229.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 1.2416040658950807,
1714
+ "epoch": 1.9610091743119265,
1715
+ "grad_norm": 2.146327257156372,
1716
+ "learning_rate": 9.428567354724047e-05,
1717
+ "loss": 0.5003,
1718
+ "mean_token_accuracy": 0.8209156513214111,
1719
+ "num_tokens": 2389870.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 1.299118459224701,
1724
+ "epoch": 1.9724770642201834,
1725
+ "grad_norm": 1.9699536561965942,
1726
+ "learning_rate": 9.419732845033093e-05,
1727
+ "loss": 0.5857,
1728
+ "mean_token_accuracy": 0.7884073138237,
1729
+ "num_tokens": 2403887.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 1.307542335987091,
1734
+ "epoch": 1.9839449541284404,
1735
+ "grad_norm": 2.541121006011963,
1736
+ "learning_rate": 9.410834770779489e-05,
1737
+ "loss": 0.6299,
1738
+ "mean_token_accuracy": 0.7736253619194031,
1739
+ "num_tokens": 2418109.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 1.2949981808662414,
1744
+ "epoch": 1.9954128440366974,
1745
+ "grad_norm": 1.7402102947235107,
1746
+ "learning_rate": 9.401873259935261e-05,
1747
+ "loss": 0.5928,
1748
+ "mean_token_accuracy": 0.7905942320823669,
1749
+ "num_tokens": 2432561.0,
1750
+ "step": 1740
1751
+ }
1752
+ ],
1753
+ "logging_steps": 10,
1754
+ "max_steps": 8720,
1755
+ "num_input_tokens_seen": 0,
1756
+ "num_train_epochs": 10,
1757
+ "save_steps": 500,
1758
+ "stateful_callbacks": {
1759
+ "TrainerControl": {
1760
+ "args": {
1761
+ "should_epoch_stop": false,
1762
+ "should_evaluate": false,
1763
+ "should_log": false,
1764
+ "should_save": true,
1765
+ "should_training_stop": false
1766
+ },
1767
+ "attributes": {}
1768
+ }
1769
+ },
1770
+ "total_flos": 1.036845473218007e+17,
1771
+ "train_batch_size": 8,
1772
+ "trial_name": null,
1773
+ "trial_params": null
1774
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dcff46eb1f7b1db33b94473d51718fd5ce505d0f76daf7d95b3eed2319ff9b0
3
+ size 6481
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-1744/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "gate_proj",
39
+ "q_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:719f5fb00afaa0b7a771b34e32c90e065e8a706f3a2f57195703efc1935853dc
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/trainer_state.json ADDED
@@ -0,0 +1,2644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2616,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2426587224006653,
14
+ "epoch": 0.011467889908256881,
15
+ "grad_norm": 3.028918981552124,
16
+ "learning_rate": 2.0642201834862385e-06,
17
+ "loss": 0.6546,
18
+ "mean_token_accuracy": 0.7775660812854767,
19
+ "num_tokens": 14294.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2369988679885864,
24
+ "epoch": 0.022935779816513763,
25
+ "grad_norm": 2.726299524307251,
26
+ "learning_rate": 4.357798165137615e-06,
27
+ "loss": 0.6567,
28
+ "mean_token_accuracy": 0.7740493595600129,
29
+ "num_tokens": 28037.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2392263770103455,
34
+ "epoch": 0.034403669724770644,
35
+ "grad_norm": 2.2062387466430664,
36
+ "learning_rate": 6.651376146788992e-06,
37
+ "loss": 0.6411,
38
+ "mean_token_accuracy": 0.7753027558326722,
39
+ "num_tokens": 42268.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.2435187816619873,
44
+ "epoch": 0.045871559633027525,
45
+ "grad_norm": 2.4188315868377686,
46
+ "learning_rate": 8.944954128440369e-06,
47
+ "loss": 0.6271,
48
+ "mean_token_accuracy": 0.7827982664108276,
49
+ "num_tokens": 56474.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.261301326751709,
54
+ "epoch": 0.05733944954128441,
55
+ "grad_norm": 2.490365982055664,
56
+ "learning_rate": 1.1238532110091744e-05,
57
+ "loss": 0.6343,
58
+ "mean_token_accuracy": 0.7792274355888367,
59
+ "num_tokens": 70348.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.2830254554748535,
64
+ "epoch": 0.06880733944954129,
65
+ "grad_norm": 2.6820173263549805,
66
+ "learning_rate": 1.3532110091743119e-05,
67
+ "loss": 0.6298,
68
+ "mean_token_accuracy": 0.7811978995800019,
69
+ "num_tokens": 84857.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.275150680541992,
74
+ "epoch": 0.08027522935779817,
75
+ "grad_norm": 1.8796888589859009,
76
+ "learning_rate": 1.5825688073394497e-05,
77
+ "loss": 0.6689,
78
+ "mean_token_accuracy": 0.7587406218051911,
79
+ "num_tokens": 99396.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.2276832342147828,
84
+ "epoch": 0.09174311926605505,
85
+ "grad_norm": 1.690596580505371,
86
+ "learning_rate": 1.811926605504587e-05,
87
+ "loss": 0.5586,
88
+ "mean_token_accuracy": 0.8031339049339294,
89
+ "num_tokens": 112619.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.2304488182067872,
94
+ "epoch": 0.10321100917431193,
95
+ "grad_norm": 1.7719478607177734,
96
+ "learning_rate": 2.0412844036697248e-05,
97
+ "loss": 0.6093,
98
+ "mean_token_accuracy": 0.7849110841751099,
99
+ "num_tokens": 126496.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.2812824487686156,
104
+ "epoch": 0.11467889908256881,
105
+ "grad_norm": 1.5947109460830688,
106
+ "learning_rate": 2.2706422018348624e-05,
107
+ "loss": 0.6455,
108
+ "mean_token_accuracy": 0.7670063555240632,
109
+ "num_tokens": 140628.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.2834580063819885,
114
+ "epoch": 0.12614678899082568,
115
+ "grad_norm": 1.8809813261032104,
116
+ "learning_rate": 2.5e-05,
117
+ "loss": 0.6489,
118
+ "mean_token_accuracy": 0.7716078400611878,
119
+ "num_tokens": 153959.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.2611944794654846,
124
+ "epoch": 0.13761467889908258,
125
+ "grad_norm": 1.780765414237976,
126
+ "learning_rate": 2.7293577981651375e-05,
127
+ "loss": 0.5445,
128
+ "mean_token_accuracy": 0.8067175269126892,
129
+ "num_tokens": 168207.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.272983717918396,
134
+ "epoch": 0.14908256880733944,
135
+ "grad_norm": 2.119795560836792,
136
+ "learning_rate": 2.9587155963302755e-05,
137
+ "loss": 0.6021,
138
+ "mean_token_accuracy": 0.7827390134334564,
139
+ "num_tokens": 182042.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.2775203227996825,
144
+ "epoch": 0.16055045871559634,
145
+ "grad_norm": 1.4455509185791016,
146
+ "learning_rate": 3.188073394495413e-05,
147
+ "loss": 0.5715,
148
+ "mean_token_accuracy": 0.7992757976055145,
149
+ "num_tokens": 196015.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.2987091898918153,
154
+ "epoch": 0.1720183486238532,
155
+ "grad_norm": 1.4980850219726562,
156
+ "learning_rate": 3.4174311926605505e-05,
157
+ "loss": 0.6023,
158
+ "mean_token_accuracy": 0.7867661654949188,
159
+ "num_tokens": 210215.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.2755884647369384,
164
+ "epoch": 0.1834862385321101,
165
+ "grad_norm": 1.8361093997955322,
166
+ "learning_rate": 3.646788990825688e-05,
167
+ "loss": 0.6042,
168
+ "mean_token_accuracy": 0.7830365121364593,
169
+ "num_tokens": 224017.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.2698514223098756,
174
+ "epoch": 0.19495412844036697,
175
+ "grad_norm": 1.3812596797943115,
176
+ "learning_rate": 3.876146788990826e-05,
177
+ "loss": 0.5846,
178
+ "mean_token_accuracy": 0.7849388122558594,
179
+ "num_tokens": 237482.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.244412088394165,
184
+ "epoch": 0.20642201834862386,
185
+ "grad_norm": 1.798601746559143,
186
+ "learning_rate": 4.1055045871559636e-05,
187
+ "loss": 0.547,
188
+ "mean_token_accuracy": 0.8001754701137542,
189
+ "num_tokens": 251169.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.2745447516441346,
194
+ "epoch": 0.21788990825688073,
195
+ "grad_norm": 1.9844188690185547,
196
+ "learning_rate": 4.334862385321101e-05,
197
+ "loss": 0.5623,
198
+ "mean_token_accuracy": 0.7987869799137115,
199
+ "num_tokens": 264764.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.3000339031219483,
204
+ "epoch": 0.22935779816513763,
205
+ "grad_norm": 1.4861323833465576,
206
+ "learning_rate": 4.564220183486239e-05,
207
+ "loss": 0.6123,
208
+ "mean_token_accuracy": 0.7852487206459046,
209
+ "num_tokens": 278504.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.3115605235099792,
214
+ "epoch": 0.2408256880733945,
215
+ "grad_norm": 1.8251842260360718,
216
+ "learning_rate": 4.7935779816513766e-05,
217
+ "loss": 0.658,
218
+ "mean_token_accuracy": 0.759744155406952,
219
+ "num_tokens": 293021.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.3224670886993408,
224
+ "epoch": 0.25229357798165136,
225
+ "grad_norm": 1.5895862579345703,
226
+ "learning_rate": 5.022935779816514e-05,
227
+ "loss": 0.6213,
228
+ "mean_token_accuracy": 0.7748652398586273,
229
+ "num_tokens": 306811.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.2993175029754638,
234
+ "epoch": 0.26376146788990823,
235
+ "grad_norm": 1.6444000005722046,
236
+ "learning_rate": 5.252293577981652e-05,
237
+ "loss": 0.5723,
238
+ "mean_token_accuracy": 0.7920855104923248,
239
+ "num_tokens": 320873.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.3019076824188232,
244
+ "epoch": 0.27522935779816515,
245
+ "grad_norm": 1.5319479703903198,
246
+ "learning_rate": 5.481651376146789e-05,
247
+ "loss": 0.5945,
248
+ "mean_token_accuracy": 0.7845574736595153,
249
+ "num_tokens": 335323.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.289823544025421,
254
+ "epoch": 0.286697247706422,
255
+ "grad_norm": 1.2900819778442383,
256
+ "learning_rate": 5.7110091743119266e-05,
257
+ "loss": 0.5692,
258
+ "mean_token_accuracy": 0.7950143396854401,
259
+ "num_tokens": 349575.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.2885443449020386,
264
+ "epoch": 0.2981651376146789,
265
+ "grad_norm": 1.408677577972412,
266
+ "learning_rate": 5.940366972477065e-05,
267
+ "loss": 0.6104,
268
+ "mean_token_accuracy": 0.7816850125789643,
269
+ "num_tokens": 363961.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.3065945267677308,
274
+ "epoch": 0.30963302752293576,
275
+ "grad_norm": 1.3809661865234375,
276
+ "learning_rate": 6.169724770642203e-05,
277
+ "loss": 0.6426,
278
+ "mean_token_accuracy": 0.7661891877651215,
279
+ "num_tokens": 377555.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.2889371395111084,
284
+ "epoch": 0.3211009174311927,
285
+ "grad_norm": 1.4974966049194336,
286
+ "learning_rate": 6.39908256880734e-05,
287
+ "loss": 0.5882,
288
+ "mean_token_accuracy": 0.7819968700408936,
289
+ "num_tokens": 391423.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.2952162742614746,
294
+ "epoch": 0.33256880733944955,
295
+ "grad_norm": 1.3621913194656372,
296
+ "learning_rate": 6.628440366972477e-05,
297
+ "loss": 0.57,
298
+ "mean_token_accuracy": 0.7946897804737091,
299
+ "num_tokens": 405650.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.2888988494873046,
304
+ "epoch": 0.3440366972477064,
305
+ "grad_norm": 1.793961524963379,
306
+ "learning_rate": 6.857798165137616e-05,
307
+ "loss": 0.6273,
308
+ "mean_token_accuracy": 0.7732390701770783,
309
+ "num_tokens": 419332.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.289334809780121,
314
+ "epoch": 0.3555045871559633,
315
+ "grad_norm": 1.5518903732299805,
316
+ "learning_rate": 7.087155963302753e-05,
317
+ "loss": 0.6492,
318
+ "mean_token_accuracy": 0.757664144039154,
319
+ "num_tokens": 433432.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.2899688124656676,
324
+ "epoch": 0.3669724770642202,
325
+ "grad_norm": 1.5826157331466675,
326
+ "learning_rate": 7.31651376146789e-05,
327
+ "loss": 0.5805,
328
+ "mean_token_accuracy": 0.7921296834945679,
329
+ "num_tokens": 447592.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.2863509058952332,
334
+ "epoch": 0.37844036697247707,
335
+ "grad_norm": 1.7210900783538818,
336
+ "learning_rate": 7.545871559633027e-05,
337
+ "loss": 0.5926,
338
+ "mean_token_accuracy": 0.7852405548095703,
339
+ "num_tokens": 462489.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.2890722513198853,
344
+ "epoch": 0.38990825688073394,
345
+ "grad_norm": 1.6051267385482788,
346
+ "learning_rate": 7.775229357798165e-05,
347
+ "loss": 0.6173,
348
+ "mean_token_accuracy": 0.7741429924964904,
349
+ "num_tokens": 476591.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.313070333003998,
354
+ "epoch": 0.4013761467889908,
355
+ "grad_norm": 1.7080140113830566,
356
+ "learning_rate": 8.004587155963303e-05,
357
+ "loss": 0.6165,
358
+ "mean_token_accuracy": 0.7842044055461883,
359
+ "num_tokens": 491338.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.2972561955451964,
364
+ "epoch": 0.41284403669724773,
365
+ "grad_norm": 1.7454527616500854,
366
+ "learning_rate": 8.23394495412844e-05,
367
+ "loss": 0.5927,
368
+ "mean_token_accuracy": 0.7841840922832489,
369
+ "num_tokens": 505152.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.2944241881370544,
374
+ "epoch": 0.4243119266055046,
375
+ "grad_norm": 1.8223613500595093,
376
+ "learning_rate": 8.463302752293578e-05,
377
+ "loss": 0.5862,
378
+ "mean_token_accuracy": 0.7846642255783081,
379
+ "num_tokens": 519536.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.2918418169021606,
384
+ "epoch": 0.43577981651376146,
385
+ "grad_norm": 1.323716640472412,
386
+ "learning_rate": 8.692660550458716e-05,
387
+ "loss": 0.5761,
388
+ "mean_token_accuracy": 0.788896131515503,
389
+ "num_tokens": 533610.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.3106001019477844,
394
+ "epoch": 0.44724770642201833,
395
+ "grad_norm": 2.1389827728271484,
396
+ "learning_rate": 8.922018348623854e-05,
397
+ "loss": 0.6442,
398
+ "mean_token_accuracy": 0.7677759766578675,
399
+ "num_tokens": 547213.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.2924273014068604,
404
+ "epoch": 0.45871559633027525,
405
+ "grad_norm": 1.3077127933502197,
406
+ "learning_rate": 9.151376146788991e-05,
407
+ "loss": 0.6044,
408
+ "mean_token_accuracy": 0.7855095267295837,
409
+ "num_tokens": 560707.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.3057442545890807,
414
+ "epoch": 0.4701834862385321,
415
+ "grad_norm": 1.658679723739624,
416
+ "learning_rate": 9.380733944954129e-05,
417
+ "loss": 0.5803,
418
+ "mean_token_accuracy": 0.7926251292228699,
419
+ "num_tokens": 574533.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.3044120788574218,
424
+ "epoch": 0.481651376146789,
425
+ "grad_norm": 1.7965151071548462,
426
+ "learning_rate": 9.610091743119267e-05,
427
+ "loss": 0.5984,
428
+ "mean_token_accuracy": 0.7874112606048584,
429
+ "num_tokens": 587931.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.3121570587158202,
434
+ "epoch": 0.49311926605504586,
435
+ "grad_norm": 1.1833796501159668,
436
+ "learning_rate": 9.839449541284404e-05,
437
+ "loss": 0.6231,
438
+ "mean_token_accuracy": 0.7761680126190186,
439
+ "num_tokens": 602080.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.3229384422302246,
444
+ "epoch": 0.5045871559633027,
445
+ "grad_norm": 1.98506760597229,
446
+ "learning_rate": 9.99999676404826e-05,
447
+ "loss": 0.6223,
448
+ "mean_token_accuracy": 0.774652361869812,
449
+ "num_tokens": 615535.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.2842121720314026,
454
+ "epoch": 0.5160550458715596,
455
+ "grad_norm": 1.8412768840789795,
456
+ "learning_rate": 9.999939236133826e-05,
457
+ "loss": 0.5968,
458
+ "mean_token_accuracy": 0.7840604305267334,
459
+ "num_tokens": 628767.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.3064908266067505,
464
+ "epoch": 0.5275229357798165,
465
+ "grad_norm": 1.7538436651229858,
466
+ "learning_rate": 9.999809799133033e-05,
467
+ "loss": 0.6244,
468
+ "mean_token_accuracy": 0.7701604008674622,
469
+ "num_tokens": 642874.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.3011385202407837,
474
+ "epoch": 0.5389908256880734,
475
+ "grad_norm": 2.0401413440704346,
476
+ "learning_rate": 9.99960845490744e-05,
477
+ "loss": 0.5897,
478
+ "mean_token_accuracy": 0.7876223146915435,
479
+ "num_tokens": 656374.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.3175038933753966,
484
+ "epoch": 0.5504587155963303,
485
+ "grad_norm": 1.5815656185150146,
486
+ "learning_rate": 9.999335206352783e-05,
487
+ "loss": 0.6681,
488
+ "mean_token_accuracy": 0.7586038947105408,
489
+ "num_tokens": 670397.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.3054586052894592,
494
+ "epoch": 0.5619266055045872,
495
+ "grad_norm": 1.7010897397994995,
496
+ "learning_rate": 9.998990057398916e-05,
497
+ "loss": 0.6488,
498
+ "mean_token_accuracy": 0.7646380603313446,
499
+ "num_tokens": 684143.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.2969472885131836,
504
+ "epoch": 0.573394495412844,
505
+ "grad_norm": 2.1294353008270264,
506
+ "learning_rate": 9.998573013009771e-05,
507
+ "loss": 0.6505,
508
+ "mean_token_accuracy": 0.7664439141750335,
509
+ "num_tokens": 697427.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.3074483752250672,
514
+ "epoch": 0.5848623853211009,
515
+ "grad_norm": 2.1683812141418457,
516
+ "learning_rate": 9.998084079183276e-05,
517
+ "loss": 0.5897,
518
+ "mean_token_accuracy": 0.7885696291923523,
519
+ "num_tokens": 711947.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.2956400752067565,
524
+ "epoch": 0.5963302752293578,
525
+ "grad_norm": 1.4167346954345703,
526
+ "learning_rate": 9.997523262951274e-05,
527
+ "loss": 0.6388,
528
+ "mean_token_accuracy": 0.7672183573246002,
529
+ "num_tokens": 726268.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.315368902683258,
534
+ "epoch": 0.6077981651376146,
535
+ "grad_norm": 2.1706671714782715,
536
+ "learning_rate": 9.996890572379418e-05,
537
+ "loss": 0.6844,
538
+ "mean_token_accuracy": 0.7582804381847381,
539
+ "num_tokens": 740230.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.2926068663597108,
544
+ "epoch": 0.6192660550458715,
545
+ "grad_norm": 1.6460140943527222,
546
+ "learning_rate": 9.99618601656706e-05,
547
+ "loss": 0.5693,
548
+ "mean_token_accuracy": 0.795549190044403,
549
+ "num_tokens": 754570.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.2848342299461364,
554
+ "epoch": 0.6307339449541285,
555
+ "grad_norm": 1.7705565690994263,
556
+ "learning_rate": 9.995409605647117e-05,
557
+ "loss": 0.6189,
558
+ "mean_token_accuracy": 0.7828136622905731,
559
+ "num_tokens": 768740.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.3091715574264526,
564
+ "epoch": 0.6422018348623854,
565
+ "grad_norm": 1.7903367280960083,
566
+ "learning_rate": 9.994561350785923e-05,
567
+ "loss": 0.6096,
568
+ "mean_token_accuracy": 0.7809465050697326,
569
+ "num_tokens": 782860.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.3097781181335448,
574
+ "epoch": 0.6536697247706422,
575
+ "grad_norm": 1.6261135339736938,
576
+ "learning_rate": 9.993641264183074e-05,
577
+ "loss": 0.6488,
578
+ "mean_token_accuracy": 0.7686248242855072,
579
+ "num_tokens": 796852.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.2892103433609008,
584
+ "epoch": 0.6651376146788991,
585
+ "grad_norm": 1.530013084411621,
586
+ "learning_rate": 9.992649359071247e-05,
587
+ "loss": 0.6099,
588
+ "mean_token_accuracy": 0.7832099735736847,
589
+ "num_tokens": 810833.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.2781771540641784,
594
+ "epoch": 0.676605504587156,
595
+ "grad_norm": 1.3513305187225342,
596
+ "learning_rate": 9.991585649716014e-05,
597
+ "loss": 0.6059,
598
+ "mean_token_accuracy": 0.7849724233150482,
599
+ "num_tokens": 825129.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.289398467540741,
604
+ "epoch": 0.6880733944954128,
605
+ "grad_norm": 1.2714006900787354,
606
+ "learning_rate": 9.990450151415636e-05,
607
+ "loss": 0.6262,
608
+ "mean_token_accuracy": 0.7734242856502533,
609
+ "num_tokens": 839084.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.3282314896583558,
614
+ "epoch": 0.6995412844036697,
615
+ "grad_norm": 1.6062265634536743,
616
+ "learning_rate": 9.989242880500837e-05,
617
+ "loss": 0.6804,
618
+ "mean_token_accuracy": 0.7598551273345947,
619
+ "num_tokens": 853275.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.279460871219635,
624
+ "epoch": 0.7110091743119266,
625
+ "grad_norm": 1.211531400680542,
626
+ "learning_rate": 9.987963854334581e-05,
627
+ "loss": 0.5422,
628
+ "mean_token_accuracy": 0.8087258577346802,
629
+ "num_tokens": 867001.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.3079694390296936,
634
+ "epoch": 0.7224770642201835,
635
+ "grad_norm": 1.9886008501052856,
636
+ "learning_rate": 9.986613091311811e-05,
637
+ "loss": 0.6505,
638
+ "mean_token_accuracy": 0.7643534898757934,
639
+ "num_tokens": 880836.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.3083110094070434,
644
+ "epoch": 0.7339449541284404,
645
+ "grad_norm": 1.7378991842269897,
646
+ "learning_rate": 9.98519061085919e-05,
647
+ "loss": 0.6507,
648
+ "mean_token_accuracy": 0.7652741134166717,
649
+ "num_tokens": 894456.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.3111968874931335,
654
+ "epoch": 0.7454128440366973,
655
+ "grad_norm": 1.6157206296920776,
656
+ "learning_rate": 9.983696433434821e-05,
657
+ "loss": 0.6009,
658
+ "mean_token_accuracy": 0.7828308165073394,
659
+ "num_tokens": 908581.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.3001808285713197,
664
+ "epoch": 0.7568807339449541,
665
+ "grad_norm": 1.7530412673950195,
666
+ "learning_rate": 9.982130580527951e-05,
667
+ "loss": 0.5973,
668
+ "mean_token_accuracy": 0.7872715950012207,
669
+ "num_tokens": 922198.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.3001506924629211,
674
+ "epoch": 0.768348623853211,
675
+ "grad_norm": 1.8743090629577637,
676
+ "learning_rate": 9.980493074658665e-05,
677
+ "loss": 0.5991,
678
+ "mean_token_accuracy": 0.7848590850830078,
679
+ "num_tokens": 934965.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.3329032421112061,
684
+ "epoch": 0.7798165137614679,
685
+ "grad_norm": 1.646851658821106,
686
+ "learning_rate": 9.978783939377558e-05,
687
+ "loss": 0.646,
688
+ "mean_token_accuracy": 0.76202232837677,
689
+ "num_tokens": 949474.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.3042344450950623,
694
+ "epoch": 0.7912844036697247,
695
+ "grad_norm": 1.6828117370605469,
696
+ "learning_rate": 9.9770031992654e-05,
697
+ "loss": 0.5663,
698
+ "mean_token_accuracy": 0.7932763636112213,
699
+ "num_tokens": 963414.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.3154001832008362,
704
+ "epoch": 0.8027522935779816,
705
+ "grad_norm": 1.8354583978652954,
706
+ "learning_rate": 9.975150879932784e-05,
707
+ "loss": 0.5994,
708
+ "mean_token_accuracy": 0.7792726159095764,
709
+ "num_tokens": 977203.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.307938539981842,
714
+ "epoch": 0.8142201834862385,
715
+ "grad_norm": 1.6509039402008057,
716
+ "learning_rate": 9.97322700801975e-05,
717
+ "loss": 0.5663,
718
+ "mean_token_accuracy": 0.7955432832241058,
719
+ "num_tokens": 990943.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.3173952937126159,
724
+ "epoch": 0.8256880733944955,
725
+ "grad_norm": 1.8522167205810547,
726
+ "learning_rate": 9.971231611195407e-05,
727
+ "loss": 0.614,
728
+ "mean_token_accuracy": 0.7815097570419312,
729
+ "num_tokens": 1005001.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.340037202835083,
734
+ "epoch": 0.8371559633027523,
735
+ "grad_norm": 1.4919304847717285,
736
+ "learning_rate": 9.969164718157538e-05,
737
+ "loss": 0.6348,
738
+ "mean_token_accuracy": 0.7702794313430786,
739
+ "num_tokens": 1018544.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.3305164098739624,
744
+ "epoch": 0.8486238532110092,
745
+ "grad_norm": 1.5445469617843628,
746
+ "learning_rate": 9.967026358632184e-05,
747
+ "loss": 0.6136,
748
+ "mean_token_accuracy": 0.77325798869133,
749
+ "num_tokens": 1032665.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 1.3210863590240478,
754
+ "epoch": 0.8600917431192661,
755
+ "grad_norm": 1.9453340768814087,
756
+ "learning_rate": 9.964816563373212e-05,
757
+ "loss": 0.6514,
758
+ "mean_token_accuracy": 0.7692999839782715,
759
+ "num_tokens": 1047328.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.327096664905548,
764
+ "epoch": 0.8715596330275229,
765
+ "grad_norm": 1.8478624820709229,
766
+ "learning_rate": 9.962535364161879e-05,
767
+ "loss": 0.6003,
768
+ "mean_token_accuracy": 0.7799559772014618,
769
+ "num_tokens": 1061305.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.3272370457649232,
774
+ "epoch": 0.8830275229357798,
775
+ "grad_norm": 1.9946807622909546,
776
+ "learning_rate": 9.960182793806377e-05,
777
+ "loss": 0.6315,
778
+ "mean_token_accuracy": 0.7699635088443756,
779
+ "num_tokens": 1075123.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.3235833764076232,
784
+ "epoch": 0.8944954128440367,
785
+ "grad_norm": 1.500209927558899,
786
+ "learning_rate": 9.957758886141351e-05,
787
+ "loss": 0.6527,
788
+ "mean_token_accuracy": 0.7683537185192109,
789
+ "num_tokens": 1089084.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.312354290485382,
794
+ "epoch": 0.9059633027522935,
795
+ "grad_norm": 1.6548733711242676,
796
+ "learning_rate": 9.955263676027427e-05,
797
+ "loss": 0.5927,
798
+ "mean_token_accuracy": 0.7949600100517273,
799
+ "num_tokens": 1103963.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.3421159029006957,
804
+ "epoch": 0.9174311926605505,
805
+ "grad_norm": 1.5262596607208252,
806
+ "learning_rate": 9.95269719935069e-05,
807
+ "loss": 0.6553,
808
+ "mean_token_accuracy": 0.7679201364517212,
809
+ "num_tokens": 1117901.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.344819176197052,
814
+ "epoch": 0.9288990825688074,
815
+ "grad_norm": 1.42953360080719,
816
+ "learning_rate": 9.950059493022193e-05,
817
+ "loss": 0.6607,
818
+ "mean_token_accuracy": 0.762078708410263,
819
+ "num_tokens": 1132174.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.3429975152015685,
824
+ "epoch": 0.9403669724770642,
825
+ "grad_norm": 1.648417592048645,
826
+ "learning_rate": 9.947350594977402e-05,
827
+ "loss": 0.6929,
828
+ "mean_token_accuracy": 0.7437104344367981,
829
+ "num_tokens": 1146769.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.3269536972045899,
834
+ "epoch": 0.9518348623853211,
835
+ "grad_norm": 1.802235722541809,
836
+ "learning_rate": 9.944570544175673e-05,
837
+ "loss": 0.6676,
838
+ "mean_token_accuracy": 0.7601192831993103,
839
+ "num_tokens": 1161091.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.3191216468811036,
844
+ "epoch": 0.963302752293578,
845
+ "grad_norm": 1.9612555503845215,
846
+ "learning_rate": 9.941719380599672e-05,
847
+ "loss": 0.625,
848
+ "mean_token_accuracy": 0.7729354560375213,
849
+ "num_tokens": 1173905.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.3115869045257569,
854
+ "epoch": 0.9747706422018348,
855
+ "grad_norm": 1.2845028638839722,
856
+ "learning_rate": 9.93879714525481e-05,
857
+ "loss": 0.5944,
858
+ "mean_token_accuracy": 0.7839926242828369,
859
+ "num_tokens": 1188063.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.3091205954551697,
864
+ "epoch": 0.9862385321100917,
865
+ "grad_norm": 1.8383289575576782,
866
+ "learning_rate": 9.935803880168652e-05,
867
+ "loss": 0.6237,
868
+ "mean_token_accuracy": 0.7753754138946534,
869
+ "num_tokens": 1202695.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 1.2994250178337097,
874
+ "epoch": 0.9977064220183486,
875
+ "grad_norm": 1.571912407875061,
876
+ "learning_rate": 9.932739628390316e-05,
877
+ "loss": 0.6456,
878
+ "mean_token_accuracy": 0.7671150684356689,
879
+ "num_tokens": 1216684.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.3076510548591613,
884
+ "epoch": 1.0091743119266054,
885
+ "grad_norm": 1.8406661748886108,
886
+ "learning_rate": 9.929604433989843e-05,
887
+ "loss": 0.6445,
888
+ "mean_token_accuracy": 0.7758039116859436,
889
+ "num_tokens": 1229248.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.2624098420143128,
894
+ "epoch": 1.0206422018348624,
895
+ "grad_norm": 1.9808402061462402,
896
+ "learning_rate": 9.926398342057577e-05,
897
+ "loss": 0.492,
898
+ "mean_token_accuracy": 0.8236800074577332,
899
+ "num_tokens": 1243088.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.252714467048645,
904
+ "epoch": 1.0321100917431192,
905
+ "grad_norm": 2.2568917274475098,
906
+ "learning_rate": 9.923121398703504e-05,
907
+ "loss": 0.4861,
908
+ "mean_token_accuracy": 0.8282331109046936,
909
+ "num_tokens": 1256681.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.2762907862663269,
914
+ "epoch": 1.0435779816513762,
915
+ "grad_norm": 1.7591499090194702,
916
+ "learning_rate": 9.9197736510566e-05,
917
+ "loss": 0.5326,
918
+ "mean_token_accuracy": 0.8061232268810272,
919
+ "num_tokens": 1270563.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.2779451608657837,
924
+ "epoch": 1.0550458715596331,
925
+ "grad_norm": 1.7618857622146606,
926
+ "learning_rate": 9.916355147264142e-05,
927
+ "loss": 0.5762,
928
+ "mean_token_accuracy": 0.7888909459114075,
929
+ "num_tokens": 1284789.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.3000144004821776,
934
+ "epoch": 1.06651376146789,
935
+ "grad_norm": 1.929226040840149,
936
+ "learning_rate": 9.912865936491026e-05,
937
+ "loss": 0.556,
938
+ "mean_token_accuracy": 0.7985962986946106,
939
+ "num_tokens": 1298314.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.2920597314834594,
944
+ "epoch": 1.0779816513761469,
945
+ "grad_norm": 2.1356875896453857,
946
+ "learning_rate": 9.909306068919055e-05,
947
+ "loss": 0.5872,
948
+ "mean_token_accuracy": 0.7914662003517151,
949
+ "num_tokens": 1312524.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.3042231440544128,
954
+ "epoch": 1.0894495412844036,
955
+ "grad_norm": 2.148797035217285,
956
+ "learning_rate": 9.905675595746215e-05,
957
+ "loss": 0.5507,
958
+ "mean_token_accuracy": 0.802655827999115,
959
+ "num_tokens": 1326952.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 1.277776312828064,
964
+ "epoch": 1.1009174311926606,
965
+ "grad_norm": 1.6280494928359985,
966
+ "learning_rate": 9.901974569185941e-05,
967
+ "loss": 0.5579,
968
+ "mean_token_accuracy": 0.8001268386840821,
969
+ "num_tokens": 1341302.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 1.2962275981903075,
974
+ "epoch": 1.1123853211009174,
975
+ "grad_norm": 1.8065513372421265,
976
+ "learning_rate": 9.898203042466368e-05,
977
+ "loss": 0.5492,
978
+ "mean_token_accuracy": 0.8058996260166168,
979
+ "num_tokens": 1355689.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 1.2893213629722595,
984
+ "epoch": 1.1238532110091743,
985
+ "grad_norm": 1.864761233329773,
986
+ "learning_rate": 9.894361069829565e-05,
987
+ "loss": 0.5292,
988
+ "mean_token_accuracy": 0.8077204465866089,
989
+ "num_tokens": 1369850.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 1.2918407797813416,
994
+ "epoch": 1.135321100917431,
995
+ "grad_norm": 2.276775598526001,
996
+ "learning_rate": 9.89044870653075e-05,
997
+ "loss": 0.564,
998
+ "mean_token_accuracy": 0.7952383041381836,
999
+ "num_tokens": 1384054.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 1.281248104572296,
1004
+ "epoch": 1.146788990825688,
1005
+ "grad_norm": 2.1157305240631104,
1006
+ "learning_rate": 9.886466008837503e-05,
1007
+ "loss": 0.5706,
1008
+ "mean_token_accuracy": 0.7949798464775085,
1009
+ "num_tokens": 1398492.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 1.2710728526115418,
1014
+ "epoch": 1.158256880733945,
1015
+ "grad_norm": 1.8817031383514404,
1016
+ "learning_rate": 9.882413034028948e-05,
1017
+ "loss": 0.516,
1018
+ "mean_token_accuracy": 0.8137441635131836,
1019
+ "num_tokens": 1412100.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 1.2870657205581666,
1024
+ "epoch": 1.1697247706422018,
1025
+ "grad_norm": 1.7975279092788696,
1026
+ "learning_rate": 9.878289840394938e-05,
1027
+ "loss": 0.5374,
1028
+ "mean_token_accuracy": 0.8032542705535889,
1029
+ "num_tokens": 1425770.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 1.2666459918022155,
1034
+ "epoch": 1.1811926605504588,
1035
+ "grad_norm": 2.47218656539917,
1036
+ "learning_rate": 9.874096487235212e-05,
1037
+ "loss": 0.5158,
1038
+ "mean_token_accuracy": 0.8173266768455505,
1039
+ "num_tokens": 1439309.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 1.3137032628059386,
1044
+ "epoch": 1.1926605504587156,
1045
+ "grad_norm": 1.7813074588775635,
1046
+ "learning_rate": 9.869833034858538e-05,
1047
+ "loss": 0.5324,
1048
+ "mean_token_accuracy": 0.8099446773529053,
1049
+ "num_tokens": 1454541.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 1.2864318251609803,
1054
+ "epoch": 1.2041284403669725,
1055
+ "grad_norm": 1.9276366233825684,
1056
+ "learning_rate": 9.86549954458186e-05,
1057
+ "loss": 0.5554,
1058
+ "mean_token_accuracy": 0.8048118472099304,
1059
+ "num_tokens": 1468346.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 1.2949382424354554,
1064
+ "epoch": 1.2155963302752293,
1065
+ "grad_norm": 1.9171100854873657,
1066
+ "learning_rate": 9.861096078729396e-05,
1067
+ "loss": 0.5857,
1068
+ "mean_token_accuracy": 0.7923648238182068,
1069
+ "num_tokens": 1482839.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 1.2825786828994752,
1074
+ "epoch": 1.2270642201834863,
1075
+ "grad_norm": 1.458295226097107,
1076
+ "learning_rate": 9.85662270063176e-05,
1077
+ "loss": 0.5344,
1078
+ "mean_token_accuracy": 0.8081244885921478,
1079
+ "num_tokens": 1496532.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 1.2934918642044066,
1084
+ "epoch": 1.238532110091743,
1085
+ "grad_norm": 2.2048583030700684,
1086
+ "learning_rate": 9.852079474625035e-05,
1087
+ "loss": 0.5802,
1088
+ "mean_token_accuracy": 0.7943230092525482,
1089
+ "num_tokens": 1510406.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 1.3103590607643127,
1094
+ "epoch": 1.25,
1095
+ "grad_norm": 2.103316307067871,
1096
+ "learning_rate": 9.847466466049868e-05,
1097
+ "loss": 0.5761,
1098
+ "mean_token_accuracy": 0.7919000566005707,
1099
+ "num_tokens": 1524582.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 1.2943686366081237,
1104
+ "epoch": 1.261467889908257,
1105
+ "grad_norm": 1.8935585021972656,
1106
+ "learning_rate": 9.84278374125051e-05,
1107
+ "loss": 0.5668,
1108
+ "mean_token_accuracy": 0.795119684934616,
1109
+ "num_tokens": 1538645.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 1.2833523988723754,
1114
+ "epoch": 1.2729357798165137,
1115
+ "grad_norm": 1.5310587882995605,
1116
+ "learning_rate": 9.838031367573868e-05,
1117
+ "loss": 0.4791,
1118
+ "mean_token_accuracy": 0.8290136575698852,
1119
+ "num_tokens": 1552198.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 1.2810697436332703,
1124
+ "epoch": 1.2844036697247707,
1125
+ "grad_norm": 1.9493242502212524,
1126
+ "learning_rate": 9.833209413368546e-05,
1127
+ "loss": 0.5479,
1128
+ "mean_token_accuracy": 0.7984305679798126,
1129
+ "num_tokens": 1566248.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 1.2971422672271729,
1134
+ "epoch": 1.2958715596330275,
1135
+ "grad_norm": 2.143052816390991,
1136
+ "learning_rate": 9.828317947983851e-05,
1137
+ "loss": 0.5556,
1138
+ "mean_token_accuracy": 0.7962001860141754,
1139
+ "num_tokens": 1579657.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 1.2938915967941285,
1144
+ "epoch": 1.3073394495412844,
1145
+ "grad_norm": 3.074519395828247,
1146
+ "learning_rate": 9.823357041768797e-05,
1147
+ "loss": 0.5808,
1148
+ "mean_token_accuracy": 0.7921633243560791,
1149
+ "num_tokens": 1594362.0,
1150
+ "step": 1140
1151
+ },
1152
+ {
1153
+ "entropy": 1.3013799428939818,
1154
+ "epoch": 1.3188073394495412,
1155
+ "grad_norm": 2.1249051094055176,
1156
+ "learning_rate": 9.8183267660711e-05,
1157
+ "loss": 0.5679,
1158
+ "mean_token_accuracy": 0.7960763275623322,
1159
+ "num_tokens": 1607995.0,
1160
+ "step": 1150
1161
+ },
1162
+ {
1163
+ "entropy": 1.2755417585372926,
1164
+ "epoch": 1.3302752293577982,
1165
+ "grad_norm": 1.7334320545196533,
1166
+ "learning_rate": 9.813227193236144e-05,
1167
+ "loss": 0.5211,
1168
+ "mean_token_accuracy": 0.8171180784702301,
1169
+ "num_tokens": 1621183.0,
1170
+ "step": 1160
1171
+ },
1172
+ {
1173
+ "entropy": 1.300136685371399,
1174
+ "epoch": 1.341743119266055,
1175
+ "grad_norm": 1.604264259338379,
1176
+ "learning_rate": 9.808058396605945e-05,
1177
+ "loss": 0.5622,
1178
+ "mean_token_accuracy": 0.7956745982170105,
1179
+ "num_tokens": 1634961.0,
1180
+ "step": 1170
1181
+ },
1182
+ {
1183
+ "entropy": 1.2956653475761413,
1184
+ "epoch": 1.353211009174312,
1185
+ "grad_norm": 2.304135322570801,
1186
+ "learning_rate": 9.802820450518095e-05,
1187
+ "loss": 0.5919,
1188
+ "mean_token_accuracy": 0.7799835622310638,
1189
+ "num_tokens": 1648959.0,
1190
+ "step": 1180
1191
+ },
1192
+ {
1193
+ "entropy": 1.3270721554756164,
1194
+ "epoch": 1.364678899082569,
1195
+ "grad_norm": 2.304185390472412,
1196
+ "learning_rate": 9.797513430304695e-05,
1197
+ "loss": 0.6347,
1198
+ "mean_token_accuracy": 0.7729239940643311,
1199
+ "num_tokens": 1662218.0,
1200
+ "step": 1190
1201
+ },
1202
+ {
1203
+ "entropy": 1.3200181603431702,
1204
+ "epoch": 1.3761467889908257,
1205
+ "grad_norm": 2.673722743988037,
1206
+ "learning_rate": 9.792137412291265e-05,
1207
+ "loss": 0.6568,
1208
+ "mean_token_accuracy": 0.7654553771018981,
1209
+ "num_tokens": 1675320.0,
1210
+ "step": 1200
1211
+ },
1212
+ {
1213
+ "entropy": 1.3001809120178223,
1214
+ "epoch": 1.3876146788990826,
1215
+ "grad_norm": 1.8785172700881958,
1216
+ "learning_rate": 9.786692473795654e-05,
1217
+ "loss": 0.5498,
1218
+ "mean_token_accuracy": 0.7971892893314362,
1219
+ "num_tokens": 1688732.0,
1220
+ "step": 1210
1221
+ },
1222
+ {
1223
+ "entropy": 1.2927094459533692,
1224
+ "epoch": 1.3990825688073394,
1225
+ "grad_norm": 2.299051284790039,
1226
+ "learning_rate": 9.781178693126923e-05,
1227
+ "loss": 0.5317,
1228
+ "mean_token_accuracy": 0.812885046005249,
1229
+ "num_tokens": 1702489.0,
1230
+ "step": 1220
1231
+ },
1232
+ {
1233
+ "entropy": 1.2940443515777589,
1234
+ "epoch": 1.4105504587155964,
1235
+ "grad_norm": 2.107447385787964,
1236
+ "learning_rate": 9.775596149584226e-05,
1237
+ "loss": 0.5408,
1238
+ "mean_token_accuracy": 0.8026755452156067,
1239
+ "num_tokens": 1717066.0,
1240
+ "step": 1230
1241
+ },
1242
+ {
1243
+ "entropy": 1.2880491733551025,
1244
+ "epoch": 1.4220183486238533,
1245
+ "grad_norm": 2.120649814605713,
1246
+ "learning_rate": 9.769944923455654e-05,
1247
+ "loss": 0.5122,
1248
+ "mean_token_accuracy": 0.8185527265071869,
1249
+ "num_tokens": 1730503.0,
1250
+ "step": 1240
1251
+ },
1252
+ {
1253
+ "entropy": 1.2935888648033143,
1254
+ "epoch": 1.43348623853211,
1255
+ "grad_norm": 1.8897229433059692,
1256
+ "learning_rate": 9.764225096017102e-05,
1257
+ "loss": 0.5891,
1258
+ "mean_token_accuracy": 0.7794159233570099,
1259
+ "num_tokens": 1744257.0,
1260
+ "step": 1250
1261
+ },
1262
+ {
1263
+ "entropy": 1.2713160991668702,
1264
+ "epoch": 1.4449541284403669,
1265
+ "grad_norm": 1.9189554452896118,
1266
+ "learning_rate": 9.758436749531079e-05,
1267
+ "loss": 0.5146,
1268
+ "mean_token_accuracy": 0.818141633272171,
1269
+ "num_tokens": 1758267.0,
1270
+ "step": 1260
1271
+ },
1272
+ {
1273
+ "entropy": 1.2798304796218871,
1274
+ "epoch": 1.4564220183486238,
1275
+ "grad_norm": 2.2521767616271973,
1276
+ "learning_rate": 9.752579967245538e-05,
1277
+ "loss": 0.5959,
1278
+ "mean_token_accuracy": 0.7902258694171905,
1279
+ "num_tokens": 1771990.0,
1280
+ "step": 1270
1281
+ },
1282
+ {
1283
+ "entropy": 1.296580719947815,
1284
+ "epoch": 1.4678899082568808,
1285
+ "grad_norm": 1.5478334426879883,
1286
+ "learning_rate": 9.746654833392677e-05,
1287
+ "loss": 0.5636,
1288
+ "mean_token_accuracy": 0.8009288847446442,
1289
+ "num_tokens": 1786045.0,
1290
+ "step": 1280
1291
+ },
1292
+ {
1293
+ "entropy": 1.2467906951904297,
1294
+ "epoch": 1.4793577981651376,
1295
+ "grad_norm": 1.8531265258789062,
1296
+ "learning_rate": 9.740661433187725e-05,
1297
+ "loss": 0.4514,
1298
+ "mean_token_accuracy": 0.8369600057601929,
1299
+ "num_tokens": 1800019.0,
1300
+ "step": 1290
1301
+ },
1302
+ {
1303
+ "entropy": 1.2813060760498047,
1304
+ "epoch": 1.4908256880733946,
1305
+ "grad_norm": 2.007786512374878,
1306
+ "learning_rate": 9.734599852827712e-05,
1307
+ "loss": 0.5587,
1308
+ "mean_token_accuracy": 0.8045243263244629,
1309
+ "num_tokens": 1814394.0,
1310
+ "step": 1300
1311
+ },
1312
+ {
1313
+ "entropy": 1.2923226833343506,
1314
+ "epoch": 1.5022935779816513,
1315
+ "grad_norm": 2.0562584400177,
1316
+ "learning_rate": 9.728470179490244e-05,
1317
+ "loss": 0.563,
1318
+ "mean_token_accuracy": 0.79967080950737,
1319
+ "num_tokens": 1827604.0,
1320
+ "step": 1310
1321
+ },
1322
+ {
1323
+ "entropy": 1.28248028755188,
1324
+ "epoch": 1.5137614678899083,
1325
+ "grad_norm": 1.8021918535232544,
1326
+ "learning_rate": 9.72227250133223e-05,
1327
+ "loss": 0.5535,
1328
+ "mean_token_accuracy": 0.8028985977172851,
1329
+ "num_tokens": 1841751.0,
1330
+ "step": 1320
1331
+ },
1332
+ {
1333
+ "entropy": 1.2800176739692688,
1334
+ "epoch": 1.5252293577981653,
1335
+ "grad_norm": 2.0901622772216797,
1336
+ "learning_rate": 9.71600690748863e-05,
1337
+ "loss": 0.5889,
1338
+ "mean_token_accuracy": 0.7968101024627685,
1339
+ "num_tokens": 1856403.0,
1340
+ "step": 1330
1341
+ },
1342
+ {
1343
+ "entropy": 1.2775539755821228,
1344
+ "epoch": 1.536697247706422,
1345
+ "grad_norm": 1.9024734497070312,
1346
+ "learning_rate": 9.709673488071163e-05,
1347
+ "loss": 0.5529,
1348
+ "mean_token_accuracy": 0.7998219549655914,
1349
+ "num_tokens": 1870952.0,
1350
+ "step": 1340
1351
+ },
1352
+ {
1353
+ "entropy": 1.3066880822181701,
1354
+ "epoch": 1.5481651376146788,
1355
+ "grad_norm": 2.2026913166046143,
1356
+ "learning_rate": 9.70327233416702e-05,
1357
+ "loss": 0.6146,
1358
+ "mean_token_accuracy": 0.7799036145210266,
1359
+ "num_tokens": 1884850.0,
1360
+ "step": 1350
1361
+ },
1362
+ {
1363
+ "entropy": 1.2854471683502198,
1364
+ "epoch": 1.5596330275229358,
1365
+ "grad_norm": 1.995058298110962,
1366
+ "learning_rate": 9.696803537837542e-05,
1367
+ "loss": 0.5744,
1368
+ "mean_token_accuracy": 0.7955298364162445,
1369
+ "num_tokens": 1898895.0,
1370
+ "step": 1360
1371
+ },
1372
+ {
1373
+ "entropy": 1.2856696963310241,
1374
+ "epoch": 1.5711009174311927,
1375
+ "grad_norm": 1.913603663444519,
1376
+ "learning_rate": 9.690267192116908e-05,
1377
+ "loss": 0.525,
1378
+ "mean_token_accuracy": 0.8169679343700409,
1379
+ "num_tokens": 1913026.0,
1380
+ "step": 1370
1381
+ },
1382
+ {
1383
+ "entropy": 1.3183680534362794,
1384
+ "epoch": 1.5825688073394495,
1385
+ "grad_norm": 2.7248916625976562,
1386
+ "learning_rate": 9.683663391010791e-05,
1387
+ "loss": 0.6482,
1388
+ "mean_token_accuracy": 0.7678777754306794,
1389
+ "num_tokens": 1927053.0,
1390
+ "step": 1380
1391
+ },
1392
+ {
1393
+ "entropy": 1.298743522167206,
1394
+ "epoch": 1.5940366972477065,
1395
+ "grad_norm": 2.011831521987915,
1396
+ "learning_rate": 9.676992229495004e-05,
1397
+ "loss": 0.577,
1398
+ "mean_token_accuracy": 0.7876397609710694,
1399
+ "num_tokens": 1940596.0,
1400
+ "step": 1390
1401
+ },
1402
+ {
1403
+ "entropy": 1.294689130783081,
1404
+ "epoch": 1.6055045871559632,
1405
+ "grad_norm": 2.2598249912261963,
1406
+ "learning_rate": 9.670253803514142e-05,
1407
+ "loss": 0.5746,
1408
+ "mean_token_accuracy": 0.7938637971878052,
1409
+ "num_tokens": 1955635.0,
1410
+ "step": 1400
1411
+ },
1412
+ {
1413
+ "entropy": 1.3118200659751893,
1414
+ "epoch": 1.6169724770642202,
1415
+ "grad_norm": 1.9109872579574585,
1416
+ "learning_rate": 9.66344820998019e-05,
1417
+ "loss": 0.5996,
1418
+ "mean_token_accuracy": 0.7869695067405701,
1419
+ "num_tokens": 1970187.0,
1420
+ "step": 1410
1421
+ },
1422
+ {
1423
+ "entropy": 1.2969690084457397,
1424
+ "epoch": 1.6284403669724772,
1425
+ "grad_norm": 2.021652936935425,
1426
+ "learning_rate": 9.656575546771144e-05,
1427
+ "loss": 0.5692,
1428
+ "mean_token_accuracy": 0.7921172618865967,
1429
+ "num_tokens": 1983963.0,
1430
+ "step": 1420
1431
+ },
1432
+ {
1433
+ "entropy": 1.3053216218948365,
1434
+ "epoch": 1.639908256880734,
1435
+ "grad_norm": 2.056626081466675,
1436
+ "learning_rate": 9.649635912729589e-05,
1437
+ "loss": 0.5534,
1438
+ "mean_token_accuracy": 0.7994763553142548,
1439
+ "num_tokens": 1997426.0,
1440
+ "step": 1430
1441
+ },
1442
+ {
1443
+ "entropy": 1.307614517211914,
1444
+ "epoch": 1.6513761467889907,
1445
+ "grad_norm": 2.0294957160949707,
1446
+ "learning_rate": 9.642629407661288e-05,
1447
+ "loss": 0.6113,
1448
+ "mean_token_accuracy": 0.7812033116817474,
1449
+ "num_tokens": 2011810.0,
1450
+ "step": 1440
1451
+ },
1452
+ {
1453
+ "entropy": 1.2840725421905517,
1454
+ "epoch": 1.6628440366972477,
1455
+ "grad_norm": 2.376054525375366,
1456
+ "learning_rate": 9.63555613233374e-05,
1457
+ "loss": 0.5333,
1458
+ "mean_token_accuracy": 0.8069488048553467,
1459
+ "num_tokens": 2025702.0,
1460
+ "step": 1450
1461
+ },
1462
+ {
1463
+ "entropy": 1.2848711609840393,
1464
+ "epoch": 1.6743119266055047,
1465
+ "grad_norm": 2.387098550796509,
1466
+ "learning_rate": 9.628416188474735e-05,
1467
+ "loss": 0.5295,
1468
+ "mean_token_accuracy": 0.8113990724086761,
1469
+ "num_tokens": 2040039.0,
1470
+ "step": 1460
1471
+ },
1472
+ {
1473
+ "entropy": 1.3038938522338868,
1474
+ "epoch": 1.6857798165137616,
1475
+ "grad_norm": 2.6049790382385254,
1476
+ "learning_rate": 9.621209678770889e-05,
1477
+ "loss": 0.5902,
1478
+ "mean_token_accuracy": 0.7839356422424316,
1479
+ "num_tokens": 2054883.0,
1480
+ "step": 1470
1481
+ },
1482
+ {
1483
+ "entropy": 1.3001854181289674,
1484
+ "epoch": 1.6972477064220184,
1485
+ "grad_norm": 2.08150577545166,
1486
+ "learning_rate": 9.613936706866168e-05,
1487
+ "loss": 0.5804,
1488
+ "mean_token_accuracy": 0.7912817001342773,
1489
+ "num_tokens": 2068892.0,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "entropy": 1.2911452770233154,
1494
+ "epoch": 1.7087155963302751,
1495
+ "grad_norm": 2.2386717796325684,
1496
+ "learning_rate": 9.606597377360396e-05,
1497
+ "loss": 0.5902,
1498
+ "mean_token_accuracy": 0.7858116149902343,
1499
+ "num_tokens": 2083075.0,
1500
+ "step": 1490
1501
+ },
1502
+ {
1503
+ "entropy": 1.2923203349113463,
1504
+ "epoch": 1.7201834862385321,
1505
+ "grad_norm": 1.9360357522964478,
1506
+ "learning_rate": 9.59919179580775e-05,
1507
+ "loss": 0.5931,
1508
+ "mean_token_accuracy": 0.7880455732345581,
1509
+ "num_tokens": 2097088.0,
1510
+ "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 1.2811247110366821,
1514
+ "epoch": 1.731651376146789,
1515
+ "grad_norm": 2.346832275390625,
1516
+ "learning_rate": 9.591720068715247e-05,
1517
+ "loss": 0.5381,
1518
+ "mean_token_accuracy": 0.8110429465770721,
1519
+ "num_tokens": 2110713.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 1.2997817516326904,
1524
+ "epoch": 1.7431192660550459,
1525
+ "grad_norm": 2.1013338565826416,
1526
+ "learning_rate": 9.584182303541205e-05,
1527
+ "loss": 0.5771,
1528
+ "mean_token_accuracy": 0.7898500382900238,
1529
+ "num_tokens": 2124467.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 1.283075988292694,
1534
+ "epoch": 1.7545871559633026,
1535
+ "grad_norm": 1.718410849571228,
1536
+ "learning_rate": 9.576578608693703e-05,
1537
+ "loss": 0.5545,
1538
+ "mean_token_accuracy": 0.8036096036434174,
1539
+ "num_tokens": 2139017.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 1.2541950225830079,
1544
+ "epoch": 1.7660550458715596,
1545
+ "grad_norm": 2.381345510482788,
1546
+ "learning_rate": 9.568909093529022e-05,
1547
+ "loss": 0.5071,
1548
+ "mean_token_accuracy": 0.8172869801521301,
1549
+ "num_tokens": 2153212.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 1.2600136041641234,
1554
+ "epoch": 1.7775229357798166,
1555
+ "grad_norm": 1.9568657875061035,
1556
+ "learning_rate": 9.561173868350067e-05,
1557
+ "loss": 0.5251,
1558
+ "mean_token_accuracy": 0.8089884519577026,
1559
+ "num_tokens": 2167190.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 1.2688735485076905,
1564
+ "epoch": 1.7889908256880735,
1565
+ "grad_norm": 2.0126872062683105,
1566
+ "learning_rate": 9.553373044404783e-05,
1567
+ "loss": 0.5563,
1568
+ "mean_token_accuracy": 0.8013049483299255,
1569
+ "num_tokens": 2181135.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 1.2632331728935242,
1574
+ "epoch": 1.8004587155963303,
1575
+ "grad_norm": 1.7177560329437256,
1576
+ "learning_rate": 9.54550673388456e-05,
1577
+ "loss": 0.5456,
1578
+ "mean_token_accuracy": 0.8039442837238312,
1579
+ "num_tokens": 2195099.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 1.2656291127204895,
1584
+ "epoch": 1.811926605504587,
1585
+ "grad_norm": 2.6126630306243896,
1586
+ "learning_rate": 9.537575049922613e-05,
1587
+ "loss": 0.5516,
1588
+ "mean_token_accuracy": 0.7961392283439637,
1589
+ "num_tokens": 2209220.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 1.278434193134308,
1594
+ "epoch": 1.823394495412844,
1595
+ "grad_norm": 2.216356039047241,
1596
+ "learning_rate": 9.52957810659236e-05,
1597
+ "loss": 0.548,
1598
+ "mean_token_accuracy": 0.7977044761180878,
1599
+ "num_tokens": 2222873.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 1.285041868686676,
1604
+ "epoch": 1.834862385321101,
1605
+ "grad_norm": 2.2278988361358643,
1606
+ "learning_rate": 9.521516018905771e-05,
1607
+ "loss": 0.5905,
1608
+ "mean_token_accuracy": 0.7802383601665497,
1609
+ "num_tokens": 2237054.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 1.2938857316970824,
1614
+ "epoch": 1.8463302752293578,
1615
+ "grad_norm": 2.0378856658935547,
1616
+ "learning_rate": 9.513388902811733e-05,
1617
+ "loss": 0.6033,
1618
+ "mean_token_accuracy": 0.7891092479228974,
1619
+ "num_tokens": 2250581.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 1.2730875372886659,
1624
+ "epoch": 1.8577981651376145,
1625
+ "grad_norm": 1.9576410055160522,
1626
+ "learning_rate": 9.505196875194362e-05,
1627
+ "loss": 0.5709,
1628
+ "mean_token_accuracy": 0.7948619246482849,
1629
+ "num_tokens": 2264352.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 1.2942588448524475,
1634
+ "epoch": 1.8692660550458715,
1635
+ "grad_norm": 3.2486989498138428,
1636
+ "learning_rate": 9.496940053871333e-05,
1637
+ "loss": 0.5695,
1638
+ "mean_token_accuracy": 0.7931654870510101,
1639
+ "num_tokens": 2278395.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 1.2859179735183717,
1644
+ "epoch": 1.8807339449541285,
1645
+ "grad_norm": 1.7161357402801514,
1646
+ "learning_rate": 9.488618557592187e-05,
1647
+ "loss": 0.5588,
1648
+ "mean_token_accuracy": 0.7988445639610291,
1649
+ "num_tokens": 2292458.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 1.287862777709961,
1654
+ "epoch": 1.8922018348623855,
1655
+ "grad_norm": 1.7279341220855713,
1656
+ "learning_rate": 9.480232506036618e-05,
1657
+ "loss": 0.5718,
1658
+ "mean_token_accuracy": 0.7963582694530487,
1659
+ "num_tokens": 2305950.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 1.2868569016456604,
1664
+ "epoch": 1.9036697247706422,
1665
+ "grad_norm": 1.7532700300216675,
1666
+ "learning_rate": 9.471782019812748e-05,
1667
+ "loss": 0.5739,
1668
+ "mean_token_accuracy": 0.7951330602169037,
1669
+ "num_tokens": 2320092.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 1.2931817889213562,
1674
+ "epoch": 1.915137614678899,
1675
+ "grad_norm": 2.7232377529144287,
1676
+ "learning_rate": 9.463267220455408e-05,
1677
+ "loss": 0.5996,
1678
+ "mean_token_accuracy": 0.7812487840652466,
1679
+ "num_tokens": 2334035.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 1.2847351789474488,
1684
+ "epoch": 1.926605504587156,
1685
+ "grad_norm": 2.1023809909820557,
1686
+ "learning_rate": 9.454688230424372e-05,
1687
+ "loss": 0.5516,
1688
+ "mean_token_accuracy": 0.8027086973190307,
1689
+ "num_tokens": 2348205.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 1.2944233417510986,
1694
+ "epoch": 1.938073394495413,
1695
+ "grad_norm": 2.789158582687378,
1696
+ "learning_rate": 9.446045173102607e-05,
1697
+ "loss": 0.6096,
1698
+ "mean_token_accuracy": 0.7904924273490905,
1699
+ "num_tokens": 2362411.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 1.296918225288391,
1704
+ "epoch": 1.9495412844036697,
1705
+ "grad_norm": 2.8648757934570312,
1706
+ "learning_rate": 9.437338172794495e-05,
1707
+ "loss": 0.5851,
1708
+ "mean_token_accuracy": 0.7826291382312774,
1709
+ "num_tokens": 2376229.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 1.2416040658950807,
1714
+ "epoch": 1.9610091743119265,
1715
+ "grad_norm": 2.146327257156372,
1716
+ "learning_rate": 9.428567354724047e-05,
1717
+ "loss": 0.5003,
1718
+ "mean_token_accuracy": 0.8209156513214111,
1719
+ "num_tokens": 2389870.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 1.299118459224701,
1724
+ "epoch": 1.9724770642201834,
1725
+ "grad_norm": 1.9699536561965942,
1726
+ "learning_rate": 9.419732845033093e-05,
1727
+ "loss": 0.5857,
1728
+ "mean_token_accuracy": 0.7884073138237,
1729
+ "num_tokens": 2403887.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 1.307542335987091,
1734
+ "epoch": 1.9839449541284404,
1735
+ "grad_norm": 2.541121006011963,
1736
+ "learning_rate": 9.410834770779489e-05,
1737
+ "loss": 0.6299,
1738
+ "mean_token_accuracy": 0.7736253619194031,
1739
+ "num_tokens": 2418109.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 1.2949981808662414,
1744
+ "epoch": 1.9954128440366974,
1745
+ "grad_norm": 1.7402102947235107,
1746
+ "learning_rate": 9.401873259935261e-05,
1747
+ "loss": 0.5928,
1748
+ "mean_token_accuracy": 0.7905942320823669,
1749
+ "num_tokens": 2432561.0,
1750
+ "step": 1740
1751
+ },
1752
+ {
1753
+ "entropy": 1.2499936938285827,
1754
+ "epoch": 2.006880733944954,
1755
+ "grad_norm": 1.9419931173324585,
1756
+ "learning_rate": 9.392848441384791e-05,
1757
+ "loss": 0.4459,
1758
+ "mean_token_accuracy": 0.8404906570911408,
1759
+ "num_tokens": 2445642.0,
1760
+ "step": 1750
1761
+ },
1762
+ {
1763
+ "entropy": 1.2266974091529845,
1764
+ "epoch": 2.018348623853211,
1765
+ "grad_norm": 2.0387706756591797,
1766
+ "learning_rate": 9.383760444922948e-05,
1767
+ "loss": 0.4638,
1768
+ "mean_token_accuracy": 0.837564754486084,
1769
+ "num_tokens": 2459784.0,
1770
+ "step": 1760
1771
+ },
1772
+ {
1773
+ "entropy": 1.236153519153595,
1774
+ "epoch": 2.029816513761468,
1775
+ "grad_norm": 3.5168395042419434,
1776
+ "learning_rate": 9.374609401253222e-05,
1777
+ "loss": 0.4331,
1778
+ "mean_token_accuracy": 0.8434469997882843,
1779
+ "num_tokens": 2473618.0,
1780
+ "step": 1770
1781
+ },
1782
+ {
1783
+ "entropy": 1.2012577056884766,
1784
+ "epoch": 2.041284403669725,
1785
+ "grad_norm": 2.028303384780884,
1786
+ "learning_rate": 9.365395441985854e-05,
1787
+ "loss": 0.4092,
1788
+ "mean_token_accuracy": 0.8543282926082612,
1789
+ "num_tokens": 2487897.0,
1790
+ "step": 1780
1791
+ },
1792
+ {
1793
+ "entropy": 1.2243779063224793,
1794
+ "epoch": 2.052752293577982,
1795
+ "grad_norm": 2.172203779220581,
1796
+ "learning_rate": 9.35611869963593e-05,
1797
+ "loss": 0.4359,
1798
+ "mean_token_accuracy": 0.8463135242462159,
1799
+ "num_tokens": 2501905.0,
1800
+ "step": 1790
1801
+ },
1802
+ {
1803
+ "entropy": 1.2188843488693237,
1804
+ "epoch": 2.0642201834862384,
1805
+ "grad_norm": 2.771411657333374,
1806
+ "learning_rate": 9.346779307621485e-05,
1807
+ "loss": 0.4237,
1808
+ "mean_token_accuracy": 0.847892826795578,
1809
+ "num_tokens": 2516177.0,
1810
+ "step": 1800
1811
+ },
1812
+ {
1813
+ "entropy": 1.2115617513656616,
1814
+ "epoch": 2.0756880733944953,
1815
+ "grad_norm": 3.289663314819336,
1816
+ "learning_rate": 9.33737740026158e-05,
1817
+ "loss": 0.4571,
1818
+ "mean_token_accuracy": 0.8364902794361114,
1819
+ "num_tokens": 2529738.0,
1820
+ "step": 1810
1821
+ },
1822
+ {
1823
+ "entropy": 1.215058648586273,
1824
+ "epoch": 2.0871559633027523,
1825
+ "grad_norm": 2.3567795753479004,
1826
+ "learning_rate": 9.327913112774375e-05,
1827
+ "loss": 0.3728,
1828
+ "mean_token_accuracy": 0.8703641653060913,
1829
+ "num_tokens": 2544060.0,
1830
+ "step": 1820
1831
+ },
1832
+ {
1833
+ "entropy": 1.1677044749259948,
1834
+ "epoch": 2.0986238532110093,
1835
+ "grad_norm": 3.139902114868164,
1836
+ "learning_rate": 9.318386581275175e-05,
1837
+ "loss": 0.393,
1838
+ "mean_token_accuracy": 0.8582305371761322,
1839
+ "num_tokens": 2558412.0,
1840
+ "step": 1830
1841
+ },
1842
+ {
1843
+ "entropy": 1.205257821083069,
1844
+ "epoch": 2.1100917431192663,
1845
+ "grad_norm": 2.8340916633605957,
1846
+ "learning_rate": 9.308797942774481e-05,
1847
+ "loss": 0.4543,
1848
+ "mean_token_accuracy": 0.8368471086025238,
1849
+ "num_tokens": 2571273.0,
1850
+ "step": 1840
1851
+ },
1852
+ {
1853
+ "entropy": 1.2078231811523437,
1854
+ "epoch": 2.121559633027523,
1855
+ "grad_norm": 2.454955577850342,
1856
+ "learning_rate": 9.299147335176018e-05,
1857
+ "loss": 0.4406,
1858
+ "mean_token_accuracy": 0.8422400116920471,
1859
+ "num_tokens": 2585553.0,
1860
+ "step": 1850
1861
+ },
1862
+ {
1863
+ "entropy": 1.192740023136139,
1864
+ "epoch": 2.13302752293578,
1865
+ "grad_norm": 3.1895253658294678,
1866
+ "learning_rate": 9.289434897274742e-05,
1867
+ "loss": 0.3898,
1868
+ "mean_token_accuracy": 0.8690705955028534,
1869
+ "num_tokens": 2599421.0,
1870
+ "step": 1860
1871
+ },
1872
+ {
1873
+ "entropy": 1.2215320587158203,
1874
+ "epoch": 2.1444954128440368,
1875
+ "grad_norm": 2.7647485733032227,
1876
+ "learning_rate": 9.279660768754863e-05,
1877
+ "loss": 0.4149,
1878
+ "mean_token_accuracy": 0.8522852241992951,
1879
+ "num_tokens": 2612894.0,
1880
+ "step": 1870
1881
+ },
1882
+ {
1883
+ "entropy": 1.226225733757019,
1884
+ "epoch": 2.1559633027522938,
1885
+ "grad_norm": 2.4062561988830566,
1886
+ "learning_rate": 9.269825090187818e-05,
1887
+ "loss": 0.4611,
1888
+ "mean_token_accuracy": 0.8352656781673431,
1889
+ "num_tokens": 2626813.0,
1890
+ "step": 1880
1891
+ },
1892
+ {
1893
+ "entropy": 1.2242176175117492,
1894
+ "epoch": 2.1674311926605503,
1895
+ "grad_norm": 3.1008799076080322,
1896
+ "learning_rate": 9.259928003030259e-05,
1897
+ "loss": 0.455,
1898
+ "mean_token_accuracy": 0.8366096138954162,
1899
+ "num_tokens": 2641120.0,
1900
+ "step": 1890
1901
+ },
1902
+ {
1903
+ "entropy": 1.1906989932060241,
1904
+ "epoch": 2.1788990825688073,
1905
+ "grad_norm": 2.4737274646759033,
1906
+ "learning_rate": 9.249969649622012e-05,
1907
+ "loss": 0.4102,
1908
+ "mean_token_accuracy": 0.8577539443969726,
1909
+ "num_tokens": 2655355.0,
1910
+ "step": 1900
1911
+ },
1912
+ {
1913
+ "entropy": 1.2102949023246765,
1914
+ "epoch": 2.1903669724770642,
1915
+ "grad_norm": 2.7570645809173584,
1916
+ "learning_rate": 9.239950173184038e-05,
1917
+ "loss": 0.4653,
1918
+ "mean_token_accuracy": 0.8341022551059722,
1919
+ "num_tokens": 2669546.0,
1920
+ "step": 1910
1921
+ },
1922
+ {
1923
+ "entropy": 1.2431818008422852,
1924
+ "epoch": 2.2018348623853212,
1925
+ "grad_norm": 2.4581117630004883,
1926
+ "learning_rate": 9.229869717816369e-05,
1927
+ "loss": 0.4823,
1928
+ "mean_token_accuracy": 0.8271047711372376,
1929
+ "num_tokens": 2684043.0,
1930
+ "step": 1920
1931
+ },
1932
+ {
1933
+ "entropy": 1.205946135520935,
1934
+ "epoch": 2.213302752293578,
1935
+ "grad_norm": 2.6330184936523438,
1936
+ "learning_rate": 9.219728428496033e-05,
1937
+ "loss": 0.4137,
1938
+ "mean_token_accuracy": 0.8539348840713501,
1939
+ "num_tokens": 2698376.0,
1940
+ "step": 1930
1941
+ },
1942
+ {
1943
+ "entropy": 1.1894460320472717,
1944
+ "epoch": 2.2247706422018347,
1945
+ "grad_norm": 2.838942527770996,
1946
+ "learning_rate": 9.209526451074971e-05,
1947
+ "loss": 0.4026,
1948
+ "mean_token_accuracy": 0.8516385197639466,
1949
+ "num_tokens": 2712547.0,
1950
+ "step": 1940
1951
+ },
1952
+ {
1953
+ "entropy": 1.2164816498756408,
1954
+ "epoch": 2.2362385321100917,
1955
+ "grad_norm": 2.2571582794189453,
1956
+ "learning_rate": 9.199263932277945e-05,
1957
+ "loss": 0.4471,
1958
+ "mean_token_accuracy": 0.8385171294212341,
1959
+ "num_tokens": 2726604.0,
1960
+ "step": 1950
1961
+ },
1962
+ {
1963
+ "entropy": 1.2147113919258117,
1964
+ "epoch": 2.2477064220183487,
1965
+ "grad_norm": 2.4930830001831055,
1966
+ "learning_rate": 9.188941019700413e-05,
1967
+ "loss": 0.4372,
1968
+ "mean_token_accuracy": 0.8448963344097138,
1969
+ "num_tokens": 2740678.0,
1970
+ "step": 1960
1971
+ },
1972
+ {
1973
+ "entropy": 1.210788643360138,
1974
+ "epoch": 2.2591743119266057,
1975
+ "grad_norm": 2.27130126953125,
1976
+ "learning_rate": 9.178557861806427e-05,
1977
+ "loss": 0.4263,
1978
+ "mean_token_accuracy": 0.8476684868335724,
1979
+ "num_tokens": 2755318.0,
1980
+ "step": 1970
1981
+ },
1982
+ {
1983
+ "entropy": 1.2224658489227296,
1984
+ "epoch": 2.270642201834862,
1985
+ "grad_norm": 2.7551164627075195,
1986
+ "learning_rate": 9.168114607926478e-05,
1987
+ "loss": 0.4593,
1988
+ "mean_token_accuracy": 0.8373873710632325,
1989
+ "num_tokens": 2769370.0,
1990
+ "step": 1980
1991
+ },
1992
+ {
1993
+ "entropy": 1.1999244093894958,
1994
+ "epoch": 2.282110091743119,
1995
+ "grad_norm": 2.5176587104797363,
1996
+ "learning_rate": 9.157611408255362e-05,
1997
+ "loss": 0.4034,
1998
+ "mean_token_accuracy": 0.8577793180942536,
1999
+ "num_tokens": 2783032.0,
2000
+ "step": 1990
2001
+ },
2002
+ {
2003
+ "entropy": 1.2038641333580018,
2004
+ "epoch": 2.293577981651376,
2005
+ "grad_norm": 2.9991376399993896,
2006
+ "learning_rate": 9.147048413850013e-05,
2007
+ "loss": 0.4301,
2008
+ "mean_token_accuracy": 0.8453950345516205,
2009
+ "num_tokens": 2796767.0,
2010
+ "step": 2000
2011
+ },
2012
+ {
2013
+ "entropy": 1.1978623747825623,
2014
+ "epoch": 2.305045871559633,
2015
+ "grad_norm": 3.3090035915374756,
2016
+ "learning_rate": 9.136425776627332e-05,
2017
+ "loss": 0.4478,
2018
+ "mean_token_accuracy": 0.8409509658813477,
2019
+ "num_tokens": 2811040.0,
2020
+ "step": 2010
2021
+ },
2022
+ {
2023
+ "entropy": 1.1866759181022644,
2024
+ "epoch": 2.31651376146789,
2025
+ "grad_norm": 2.55146861076355,
2026
+ "learning_rate": 9.125743649362004e-05,
2027
+ "loss": 0.4111,
2028
+ "mean_token_accuracy": 0.8524983108043671,
2029
+ "num_tokens": 2824974.0,
2030
+ "step": 2020
2031
+ },
2032
+ {
2033
+ "entropy": 1.1859935998916626,
2034
+ "epoch": 2.3279816513761467,
2035
+ "grad_norm": 3.0141847133636475,
2036
+ "learning_rate": 9.115002185684298e-05,
2037
+ "loss": 0.3853,
2038
+ "mean_token_accuracy": 0.8653997778892517,
2039
+ "num_tokens": 2839045.0,
2040
+ "step": 2030
2041
+ },
2042
+ {
2043
+ "entropy": 1.200369417667389,
2044
+ "epoch": 2.3394495412844036,
2045
+ "grad_norm": 3.238649845123291,
2046
+ "learning_rate": 9.104201540077857e-05,
2047
+ "loss": 0.4367,
2048
+ "mean_token_accuracy": 0.849113005399704,
2049
+ "num_tokens": 2852630.0,
2050
+ "step": 2040
2051
+ },
2052
+ {
2053
+ "entropy": 1.1820636987686157,
2054
+ "epoch": 2.3509174311926606,
2055
+ "grad_norm": 2.7103121280670166,
2056
+ "learning_rate": 9.093341867877485e-05,
2057
+ "loss": 0.4002,
2058
+ "mean_token_accuracy": 0.8553554117679596,
2059
+ "num_tokens": 2866122.0,
2060
+ "step": 2050
2061
+ },
2062
+ {
2063
+ "entropy": 1.1925541877746582,
2064
+ "epoch": 2.3623853211009176,
2065
+ "grad_norm": 2.6658740043640137,
2066
+ "learning_rate": 9.082423325266898e-05,
2067
+ "loss": 0.3514,
2068
+ "mean_token_accuracy": 0.8793700635433197,
2069
+ "num_tokens": 2879896.0,
2070
+ "step": 2060
2071
+ },
2072
+ {
2073
+ "entropy": 1.2116564989089966,
2074
+ "epoch": 2.373853211009174,
2075
+ "grad_norm": 2.671292304992676,
2076
+ "learning_rate": 9.071446069276487e-05,
2077
+ "loss": 0.4465,
2078
+ "mean_token_accuracy": 0.8398383617401123,
2079
+ "num_tokens": 2893860.0,
2080
+ "step": 2070
2081
+ },
2082
+ {
2083
+ "entropy": 1.188430666923523,
2084
+ "epoch": 2.385321100917431,
2085
+ "grad_norm": 2.8591768741607666,
2086
+ "learning_rate": 9.060410257781067e-05,
2087
+ "loss": 0.4051,
2088
+ "mean_token_accuracy": 0.8556796789169312,
2089
+ "num_tokens": 2907637.0,
2090
+ "step": 2080
2091
+ },
2092
+ {
2093
+ "entropy": 1.2041984677314759,
2094
+ "epoch": 2.396788990825688,
2095
+ "grad_norm": 4.1422858238220215,
2096
+ "learning_rate": 9.049316049497587e-05,
2097
+ "loss": 0.4237,
2098
+ "mean_token_accuracy": 0.8538813769817353,
2099
+ "num_tokens": 2921138.0,
2100
+ "step": 2090
2101
+ },
2102
+ {
2103
+ "entropy": 1.2061491370201112,
2104
+ "epoch": 2.408256880733945,
2105
+ "grad_norm": 3.6644909381866455,
2106
+ "learning_rate": 9.038163603982861e-05,
2107
+ "loss": 0.4718,
2108
+ "mean_token_accuracy": 0.834813779592514,
2109
+ "num_tokens": 2935922.0,
2110
+ "step": 2100
2111
+ },
2112
+ {
2113
+ "entropy": 1.16827290058136,
2114
+ "epoch": 2.419724770642202,
2115
+ "grad_norm": 2.8285586833953857,
2116
+ "learning_rate": 9.026953081631274e-05,
2117
+ "loss": 0.3772,
2118
+ "mean_token_accuracy": 0.862979942560196,
2119
+ "num_tokens": 2950486.0,
2120
+ "step": 2110
2121
+ },
2122
+ {
2123
+ "entropy": 1.1864002346992493,
2124
+ "epoch": 2.4311926605504586,
2125
+ "grad_norm": 2.524240732192993,
2126
+ "learning_rate": 9.015684643672469e-05,
2127
+ "loss": 0.4183,
2128
+ "mean_token_accuracy": 0.8481187999248505,
2129
+ "num_tokens": 2964370.0,
2130
+ "step": 2120
2131
+ },
2132
+ {
2133
+ "entropy": 1.191727840900421,
2134
+ "epoch": 2.4426605504587156,
2135
+ "grad_norm": 2.7000691890716553,
2136
+ "learning_rate": 9.00435845216903e-05,
2137
+ "loss": 0.4099,
2138
+ "mean_token_accuracy": 0.859676867723465,
2139
+ "num_tokens": 2978586.0,
2140
+ "step": 2130
2141
+ },
2142
+ {
2143
+ "entropy": 1.196892774105072,
2144
+ "epoch": 2.4541284403669725,
2145
+ "grad_norm": 2.5411860942840576,
2146
+ "learning_rate": 8.992974670014156e-05,
2147
+ "loss": 0.4231,
2148
+ "mean_token_accuracy": 0.853669410943985,
2149
+ "num_tokens": 2993227.0,
2150
+ "step": 2140
2151
+ },
2152
+ {
2153
+ "entropy": 1.2281125068664551,
2154
+ "epoch": 2.4655963302752295,
2155
+ "grad_norm": 2.984402656555176,
2156
+ "learning_rate": 8.98153346092931e-05,
2157
+ "loss": 0.4914,
2158
+ "mean_token_accuracy": 0.8256430447101593,
2159
+ "num_tokens": 3006905.0,
2160
+ "step": 2150
2161
+ },
2162
+ {
2163
+ "entropy": 1.1872741818428039,
2164
+ "epoch": 2.477064220183486,
2165
+ "grad_norm": 3.6548094749450684,
2166
+ "learning_rate": 8.970034989461869e-05,
2167
+ "loss": 0.4204,
2168
+ "mean_token_accuracy": 0.8485859632492065,
2169
+ "num_tokens": 3020042.0,
2170
+ "step": 2160
2171
+ },
2172
+ {
2173
+ "entropy": 1.210246503353119,
2174
+ "epoch": 2.488532110091743,
2175
+ "grad_norm": 3.1821584701538086,
2176
+ "learning_rate": 8.95847942098276e-05,
2177
+ "loss": 0.4651,
2178
+ "mean_token_accuracy": 0.839034765958786,
2179
+ "num_tokens": 3034121.0,
2180
+ "step": 2170
2181
+ },
2182
+ {
2183
+ "entropy": 1.196474814414978,
2184
+ "epoch": 2.5,
2185
+ "grad_norm": 2.6702466011047363,
2186
+ "learning_rate": 8.946866921684075e-05,
2187
+ "loss": 0.4034,
2188
+ "mean_token_accuracy": 0.853714382648468,
2189
+ "num_tokens": 3048389.0,
2190
+ "step": 2180
2191
+ },
2192
+ {
2193
+ "entropy": 1.1994709730148316,
2194
+ "epoch": 2.511467889908257,
2195
+ "grad_norm": 2.4583847522735596,
2196
+ "learning_rate": 8.935197658576688e-05,
2197
+ "loss": 0.4547,
2198
+ "mean_token_accuracy": 0.835651034116745,
2199
+ "num_tokens": 3062510.0,
2200
+ "step": 2190
2201
+ },
2202
+ {
2203
+ "entropy": 1.1939351201057433,
2204
+ "epoch": 2.522935779816514,
2205
+ "grad_norm": 3.650007724761963,
2206
+ "learning_rate": 8.923471799487848e-05,
2207
+ "loss": 0.4858,
2208
+ "mean_token_accuracy": 0.822267484664917,
2209
+ "num_tokens": 3077006.0,
2210
+ "step": 2200
2211
+ },
2212
+ {
2213
+ "entropy": 1.1909420490264893,
2214
+ "epoch": 2.5344036697247705,
2215
+ "grad_norm": 2.5217745304107666,
2216
+ "learning_rate": 8.911689513058767e-05,
2217
+ "loss": 0.4471,
2218
+ "mean_token_accuracy": 0.8431772708892822,
2219
+ "num_tokens": 3090504.0,
2220
+ "step": 2210
2221
+ },
2222
+ {
2223
+ "entropy": 1.199105679988861,
2224
+ "epoch": 2.5458715596330275,
2225
+ "grad_norm": 3.0316340923309326,
2226
+ "learning_rate": 8.899850968742196e-05,
2227
+ "loss": 0.4777,
2228
+ "mean_token_accuracy": 0.8287393450737,
2229
+ "num_tokens": 3104342.0,
2230
+ "step": 2220
2231
+ },
2232
+ {
2233
+ "entropy": 1.2064455270767211,
2234
+ "epoch": 2.5573394495412844,
2235
+ "grad_norm": 3.974283218383789,
2236
+ "learning_rate": 8.887956336799985e-05,
2237
+ "loss": 0.4891,
2238
+ "mean_token_accuracy": 0.8292845666408539,
2239
+ "num_tokens": 3117829.0,
2240
+ "step": 2230
2241
+ },
2242
+ {
2243
+ "entropy": 1.1817766427993774,
2244
+ "epoch": 2.5688073394495414,
2245
+ "grad_norm": 2.7031972408294678,
2246
+ "learning_rate": 8.876005788300634e-05,
2247
+ "loss": 0.4419,
2248
+ "mean_token_accuracy": 0.8345361471176147,
2249
+ "num_tokens": 3131630.0,
2250
+ "step": 2240
2251
+ },
2252
+ {
2253
+ "entropy": 1.177594244480133,
2254
+ "epoch": 2.580275229357798,
2255
+ "grad_norm": 3.531320571899414,
2256
+ "learning_rate": 8.863999495116839e-05,
2257
+ "loss": 0.3869,
2258
+ "mean_token_accuracy": 0.8643759608268737,
2259
+ "num_tokens": 3145294.0,
2260
+ "step": 2250
2261
+ },
2262
+ {
2263
+ "entropy": 1.1700214266777038,
2264
+ "epoch": 2.591743119266055,
2265
+ "grad_norm": 3.8070144653320312,
2266
+ "learning_rate": 8.851937629923012e-05,
2267
+ "loss": 0.3935,
2268
+ "mean_token_accuracy": 0.8605147182941437,
2269
+ "num_tokens": 3158737.0,
2270
+ "step": 2260
2271
+ },
2272
+ {
2273
+ "entropy": 1.2061798334121705,
2274
+ "epoch": 2.603211009174312,
2275
+ "grad_norm": 2.8425440788269043,
2276
+ "learning_rate": 8.839820366192802e-05,
2277
+ "loss": 0.4542,
2278
+ "mean_token_accuracy": 0.8374004244804383,
2279
+ "num_tokens": 3172967.0,
2280
+ "step": 2270
2281
+ },
2282
+ {
2283
+ "entropy": 1.2195912718772888,
2284
+ "epoch": 2.614678899082569,
2285
+ "grad_norm": 5.081627368927002,
2286
+ "learning_rate": 8.827647878196601e-05,
2287
+ "loss": 0.4892,
2288
+ "mean_token_accuracy": 0.825077348947525,
2289
+ "num_tokens": 3186931.0,
2290
+ "step": 2280
2291
+ },
2292
+ {
2293
+ "entropy": 1.2038422107696534,
2294
+ "epoch": 2.626146788990826,
2295
+ "grad_norm": 2.543203592300415,
2296
+ "learning_rate": 8.815420340999033e-05,
2297
+ "loss": 0.4599,
2298
+ "mean_token_accuracy": 0.8369208991527557,
2299
+ "num_tokens": 3201522.0,
2300
+ "step": 2290
2301
+ },
2302
+ {
2303
+ "entropy": 1.1964881420135498,
2304
+ "epoch": 2.6376146788990824,
2305
+ "grad_norm": 2.4345703125,
2306
+ "learning_rate": 8.803137930456443e-05,
2307
+ "loss": 0.404,
2308
+ "mean_token_accuracy": 0.8523661613464355,
2309
+ "num_tokens": 3216119.0,
2310
+ "step": 2300
2311
+ },
2312
+ {
2313
+ "entropy": 1.1602870345115661,
2314
+ "epoch": 2.6490825688073394,
2315
+ "grad_norm": 3.4392335414886475,
2316
+ "learning_rate": 8.790800823214358e-05,
2317
+ "loss": 0.3756,
2318
+ "mean_token_accuracy": 0.8643016874790191,
2319
+ "num_tokens": 3230219.0,
2320
+ "step": 2310
2321
+ },
2322
+ {
2323
+ "entropy": 1.1903732061386108,
2324
+ "epoch": 2.6605504587155964,
2325
+ "grad_norm": 3.1603729724884033,
2326
+ "learning_rate": 8.77840919670496e-05,
2327
+ "loss": 0.4619,
2328
+ "mean_token_accuracy": 0.8334128022193908,
2329
+ "num_tokens": 3243964.0,
2330
+ "step": 2320
2331
+ },
2332
+ {
2333
+ "entropy": 1.2090648889541626,
2334
+ "epoch": 2.6720183486238533,
2335
+ "grad_norm": 3.2517998218536377,
2336
+ "learning_rate": 8.765963229144523e-05,
2337
+ "loss": 0.425,
2338
+ "mean_token_accuracy": 0.848976331949234,
2339
+ "num_tokens": 3258262.0,
2340
+ "step": 2330
2341
+ },
2342
+ {
2343
+ "entropy": 1.1725857734680176,
2344
+ "epoch": 2.68348623853211,
2345
+ "grad_norm": 2.936473846435547,
2346
+ "learning_rate": 8.753463099530851e-05,
2347
+ "loss": 0.4379,
2348
+ "mean_token_accuracy": 0.8404418647289276,
2349
+ "num_tokens": 3272235.0,
2350
+ "step": 2340
2351
+ },
2352
+ {
2353
+ "entropy": 1.2088525772094727,
2354
+ "epoch": 2.694954128440367,
2355
+ "grad_norm": 2.534141778945923,
2356
+ "learning_rate": 8.74090898764071e-05,
2357
+ "loss": 0.4689,
2358
+ "mean_token_accuracy": 0.8306144773960114,
2359
+ "num_tokens": 3285426.0,
2360
+ "step": 2350
2361
+ },
2362
+ {
2363
+ "entropy": 1.2100205898284913,
2364
+ "epoch": 2.706422018348624,
2365
+ "grad_norm": 3.1279990673065186,
2366
+ "learning_rate": 8.728301074027237e-05,
2367
+ "loss": 0.5061,
2368
+ "mean_token_accuracy": 0.819576495885849,
2369
+ "num_tokens": 3299795.0,
2370
+ "step": 2360
2371
+ },
2372
+ {
2373
+ "entropy": 1.2010661005973815,
2374
+ "epoch": 2.717889908256881,
2375
+ "grad_norm": 2.502993583679199,
2376
+ "learning_rate": 8.715639540017348e-05,
2377
+ "loss": 0.4144,
2378
+ "mean_token_accuracy": 0.8534714758396149,
2379
+ "num_tokens": 3314140.0,
2380
+ "step": 2370
2381
+ },
2382
+ {
2383
+ "entropy": 1.1825575947761535,
2384
+ "epoch": 2.729357798165138,
2385
+ "grad_norm": 3.2115747928619385,
2386
+ "learning_rate": 8.70292456770912e-05,
2387
+ "loss": 0.4104,
2388
+ "mean_token_accuracy": 0.8631702959537506,
2389
+ "num_tokens": 3328477.0,
2390
+ "step": 2380
2391
+ },
2392
+ {
2393
+ "entropy": 1.2106116533279419,
2394
+ "epoch": 2.7408256880733948,
2395
+ "grad_norm": 2.2947702407836914,
2396
+ "learning_rate": 8.690156339969188e-05,
2397
+ "loss": 0.452,
2398
+ "mean_token_accuracy": 0.8402946293354034,
2399
+ "num_tokens": 3342974.0,
2400
+ "step": 2390
2401
+ },
2402
+ {
2403
+ "entropy": 1.2192729353904723,
2404
+ "epoch": 2.7522935779816513,
2405
+ "grad_norm": 4.383372783660889,
2406
+ "learning_rate": 8.677335040430098e-05,
2407
+ "loss": 0.4587,
2408
+ "mean_token_accuracy": 0.8375409781932831,
2409
+ "num_tokens": 3356914.0,
2410
+ "step": 2400
2411
+ },
2412
+ {
2413
+ "entropy": 1.2175085544586182,
2414
+ "epoch": 2.7637614678899083,
2415
+ "grad_norm": 2.4807965755462646,
2416
+ "learning_rate": 8.664460853487682e-05,
2417
+ "loss": 0.4861,
2418
+ "mean_token_accuracy": 0.8293013036251068,
2419
+ "num_tokens": 3371546.0,
2420
+ "step": 2410
2421
+ },
2422
+ {
2423
+ "entropy": 1.1996549844741822,
2424
+ "epoch": 2.7752293577981653,
2425
+ "grad_norm": 2.7994184494018555,
2426
+ "learning_rate": 8.651533964298391e-05,
2427
+ "loss": 0.4615,
2428
+ "mean_token_accuracy": 0.8339998602867127,
2429
+ "num_tokens": 3385821.0,
2430
+ "step": 2420
2431
+ },
2432
+ {
2433
+ "entropy": 1.2070747137069702,
2434
+ "epoch": 2.786697247706422,
2435
+ "grad_norm": 2.6360669136047363,
2436
+ "learning_rate": 8.638554558776645e-05,
2437
+ "loss": 0.4724,
2438
+ "mean_token_accuracy": 0.8334950864315033,
2439
+ "num_tokens": 3399429.0,
2440
+ "step": 2430
2441
+ },
2442
+ {
2443
+ "entropy": 1.207094705104828,
2444
+ "epoch": 2.7981651376146788,
2445
+ "grad_norm": 3.541809320449829,
2446
+ "learning_rate": 8.625522823592149e-05,
2447
+ "loss": 0.4595,
2448
+ "mean_token_accuracy": 0.8313855290412903,
2449
+ "num_tokens": 3413081.0,
2450
+ "step": 2440
2451
+ },
2452
+ {
2453
+ "entropy": 1.165670096874237,
2454
+ "epoch": 2.8096330275229358,
2455
+ "grad_norm": 2.136975049972534,
2456
+ "learning_rate": 8.612438946167216e-05,
2457
+ "loss": 0.3889,
2458
+ "mean_token_accuracy": 0.8570282876491546,
2459
+ "num_tokens": 3426548.0,
2460
+ "step": 2450
2461
+ },
2462
+ {
2463
+ "entropy": 1.1852982997894288,
2464
+ "epoch": 2.8211009174311927,
2465
+ "grad_norm": 2.86348819732666,
2466
+ "learning_rate": 8.599303114674069e-05,
2467
+ "loss": 0.4249,
2468
+ "mean_token_accuracy": 0.8503771364688874,
2469
+ "num_tokens": 3440175.0,
2470
+ "step": 2460
2471
+ },
2472
+ {
2473
+ "entropy": 1.2077379465103149,
2474
+ "epoch": 2.8325688073394497,
2475
+ "grad_norm": 2.5795695781707764,
2476
+ "learning_rate": 8.586115518032127e-05,
2477
+ "loss": 0.4562,
2478
+ "mean_token_accuracy": 0.8366669476032257,
2479
+ "num_tokens": 3454525.0,
2480
+ "step": 2470
2481
+ },
2482
+ {
2483
+ "entropy": 1.1829241394996644,
2484
+ "epoch": 2.8440366972477067,
2485
+ "grad_norm": 2.239647388458252,
2486
+ "learning_rate": 8.572876345905305e-05,
2487
+ "loss": 0.3989,
2488
+ "mean_token_accuracy": 0.8530926644802094,
2489
+ "num_tokens": 3468835.0,
2490
+ "step": 2480
2491
+ },
2492
+ {
2493
+ "entropy": 1.1779277324676514,
2494
+ "epoch": 2.8555045871559632,
2495
+ "grad_norm": 2.7707083225250244,
2496
+ "learning_rate": 8.55958578869927e-05,
2497
+ "loss": 0.4161,
2498
+ "mean_token_accuracy": 0.8491211295127868,
2499
+ "num_tokens": 3482540.0,
2500
+ "step": 2490
2501
+ },
2502
+ {
2503
+ "entropy": 1.2271911025047302,
2504
+ "epoch": 2.86697247706422,
2505
+ "grad_norm": 3.0390429496765137,
2506
+ "learning_rate": 8.546244037558709e-05,
2507
+ "loss": 0.5198,
2508
+ "mean_token_accuracy": 0.8124437749385833,
2509
+ "num_tokens": 3496087.0,
2510
+ "step": 2500
2511
+ },
2512
+ {
2513
+ "entropy": 1.2103841185569764,
2514
+ "epoch": 2.878440366972477,
2515
+ "grad_norm": 2.824759006500244,
2516
+ "learning_rate": 8.532851284364583e-05,
2517
+ "loss": 0.4506,
2518
+ "mean_token_accuracy": 0.8423313438892365,
2519
+ "num_tokens": 3510218.0,
2520
+ "step": 2510
2521
+ },
2522
+ {
2523
+ "entropy": 1.2116459369659425,
2524
+ "epoch": 2.8899082568807337,
2525
+ "grad_norm": 2.6834771633148193,
2526
+ "learning_rate": 8.519407721731358e-05,
2527
+ "loss": 0.4716,
2528
+ "mean_token_accuracy": 0.8302793622016906,
2529
+ "num_tokens": 3524137.0,
2530
+ "step": 2520
2531
+ },
2532
+ {
2533
+ "entropy": 1.1814413189888,
2534
+ "epoch": 2.9013761467889907,
2535
+ "grad_norm": 2.3810577392578125,
2536
+ "learning_rate": 8.505913543004249e-05,
2537
+ "loss": 0.4289,
2538
+ "mean_token_accuracy": 0.8481670498847962,
2539
+ "num_tokens": 3538479.0,
2540
+ "step": 2530
2541
+ },
2542
+ {
2543
+ "entropy": 1.2182748675346375,
2544
+ "epoch": 2.9128440366972477,
2545
+ "grad_norm": 2.6762099266052246,
2546
+ "learning_rate": 8.492368942256426e-05,
2547
+ "loss": 0.4608,
2548
+ "mean_token_accuracy": 0.8374363958835602,
2549
+ "num_tokens": 3551130.0,
2550
+ "step": 2540
2551
+ },
2552
+ {
2553
+ "entropy": 1.1905839920043946,
2554
+ "epoch": 2.9243119266055047,
2555
+ "grad_norm": 2.8791279792785645,
2556
+ "learning_rate": 8.478774114286228e-05,
2557
+ "loss": 0.407,
2558
+ "mean_token_accuracy": 0.8582445919513703,
2559
+ "num_tokens": 3565579.0,
2560
+ "step": 2550
2561
+ },
2562
+ {
2563
+ "entropy": 1.2176141619682312,
2564
+ "epoch": 2.9357798165137616,
2565
+ "grad_norm": 3.3622496128082275,
2566
+ "learning_rate": 8.465129254614364e-05,
2567
+ "loss": 0.4531,
2568
+ "mean_token_accuracy": 0.8369433999061584,
2569
+ "num_tokens": 3579108.0,
2570
+ "step": 2560
2571
+ },
2572
+ {
2573
+ "entropy": 1.2156746864318848,
2574
+ "epoch": 2.9472477064220186,
2575
+ "grad_norm": 2.5315442085266113,
2576
+ "learning_rate": 8.451434559481099e-05,
2577
+ "loss": 0.414,
2578
+ "mean_token_accuracy": 0.8529948651790619,
2579
+ "num_tokens": 3593159.0,
2580
+ "step": 2570
2581
+ },
2582
+ {
2583
+ "entropy": 1.1980236649513245,
2584
+ "epoch": 2.958715596330275,
2585
+ "grad_norm": 3.484841823577881,
2586
+ "learning_rate": 8.437690225843426e-05,
2587
+ "loss": 0.4092,
2588
+ "mean_token_accuracy": 0.8564953148365021,
2589
+ "num_tokens": 3607158.0,
2590
+ "step": 2580
2591
+ },
2592
+ {
2593
+ "entropy": 1.1944978356361389,
2594
+ "epoch": 2.970183486238532,
2595
+ "grad_norm": 2.7124505043029785,
2596
+ "learning_rate": 8.423896451372245e-05,
2597
+ "loss": 0.4692,
2598
+ "mean_token_accuracy": 0.8257829248905182,
2599
+ "num_tokens": 3620742.0,
2600
+ "step": 2590
2601
+ },
2602
+ {
2603
+ "entropy": 1.1990766882896424,
2604
+ "epoch": 2.981651376146789,
2605
+ "grad_norm": 2.3464643955230713,
2606
+ "learning_rate": 8.41005343444951e-05,
2607
+ "loss": 0.4531,
2608
+ "mean_token_accuracy": 0.833730137348175,
2609
+ "num_tokens": 3634908.0,
2610
+ "step": 2600
2611
+ },
2612
+ {
2613
+ "entropy": 1.1942741513252257,
2614
+ "epoch": 2.9931192660550456,
2615
+ "grad_norm": 2.8104026317596436,
2616
+ "learning_rate": 8.396161374165379e-05,
2617
+ "loss": 0.4836,
2618
+ "mean_token_accuracy": 0.8267854213714599,
2619
+ "num_tokens": 3648446.0,
2620
+ "step": 2610
2621
+ }
2622
+ ],
2623
+ "logging_steps": 10,
2624
+ "max_steps": 8720,
2625
+ "num_input_tokens_seen": 0,
2626
+ "num_train_epochs": 10,
2627
+ "save_steps": 500,
2628
+ "stateful_callbacks": {
2629
+ "TrainerControl": {
2630
+ "args": {
2631
+ "should_epoch_stop": false,
2632
+ "should_evaluate": false,
2633
+ "should_log": false,
2634
+ "should_save": true,
2635
+ "should_training_stop": false
2636
+ },
2637
+ "attributes": {}
2638
+ }
2639
+ },
2640
+ "total_flos": 1.5552682098270106e+17,
2641
+ "train_batch_size": 8,
2642
+ "trial_name": null,
2643
+ "trial_params": null
2644
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dcff46eb1f7b1db33b94473d51718fd5ce505d0f76daf7d95b3eed2319ff9b0
3
+ size 6481
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-2616/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "gate_proj",
39
+ "q_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40c5baddcc79c12d9ba5fea4c312ba84dbb44fb7ed9042e3e2a6d74cb4852642
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dcff46eb1f7b1db33b94473d51718fd5ce505d0f76daf7d95b3eed2319ff9b0
3
+ size 6481
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-3488/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "gate_proj",
39
+ "q_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2180c7767659e7428e28d9a9ccf952ff9277226f7fe3322e822b44b304999421
3
+ size 80792096
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_vt_add_a0.75_B3_ALL_atag_noSys/checkpoint-4360/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896