Ousmanend32 committed on
Commit
7d7b8ee
·
verified ·
1 Parent(s): 7cfb9ec

Upload general knowledge SFT checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-2551/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-1.7B
3
+ library_name: transformers
4
+ model_name: gk_sft_full
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ license: apache-2.0  # assumed from base model Qwen/Qwen3-1.7B; confirm before publishing
10
+ ---
11
+
12
+ # Model Card for gk_sft_full
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Ousmanend32/gk_sft_full", device="cuda")  # TODO: confirm this repository's Hub id (the generated card had model="None")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 1.3.0
39
+ - Transformers: 5.7.0
40
+ - PyTorch: 2.10.0+cu128
41
+ - Datasets: 4.8.5
42
+ - Tokenizers: 0.22.2
43
+
44
+ ## Citations
45
+
46
+
47
+
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @software{vonwerra2020trl,
52
+ title = {{TRL: Transformers Reinforcement Learning}},
53
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
54
+ license = {Apache-2.0},
55
+ url = {https://github.com/huggingface/trl},
56
+ year = {2020}
57
+ }
58
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
checkpoint-2551/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
checkpoint-2551/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 6144,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 151643,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": true,
59
+ "transformers_version": "5.7.0",
60
+ "use_cache": false,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 151936
63
+ }
checkpoint-2551/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.6,
9
+ "top_k": 20,
10
+ "top_p": 0.95,
11
+ "transformers_version": "5.7.0"
12
+ }
checkpoint-2551/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77a976b0308720d09d1989a48aee5169ffdcf716f71404832899518b7211c1d
3
+ size 3441185608
checkpoint-2551/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75cf16cf2a3e37985328f27f6832a03a72d37f25b80a0296c458bc9b296d4936
3
+ size 6882572207
checkpoint-2551/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
3
+ size 14645
checkpoint-2551/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2c66f3faa37cf2bb890385165a08aaa2407ad1ea91ebb077248f7a98cab673a
3
+ size 1465
checkpoint-2551/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoint-2551/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "local_files_only": false,
25
+ "model_max_length": 131072,
26
+ "pad_token": "<|endoftext|>",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
checkpoint-2551/trainer_state.json ADDED
@@ -0,0 +1,2584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2551,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.40051139052957296,
14
+ "epoch": 0.0039204155640497895,
15
+ "grad_norm": 30.875,
16
+ "learning_rate": 7.058823529411766e-07,
17
+ "loss": 1.316241455078125,
18
+ "mean_token_accuracy": 0.7522591009736062,
19
+ "num_tokens": 130734.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 0.39800101406872274,
24
+ "epoch": 0.007840831128099579,
25
+ "grad_norm": 28.875,
26
+ "learning_rate": 1.4901960784313726e-06,
27
+ "loss": 1.2585247993469237,
28
+ "mean_token_accuracy": 0.7558266721665859,
29
+ "num_tokens": 267547.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 0.4350921854376793,
34
+ "epoch": 0.011761246692149369,
35
+ "grad_norm": 19.25,
36
+ "learning_rate": 2.274509803921569e-06,
37
+ "loss": 1.2427661895751954,
38
+ "mean_token_accuracy": 0.7490861490368843,
39
+ "num_tokens": 399201.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 0.4768921874463558,
44
+ "epoch": 0.015681662256199158,
45
+ "grad_norm": 12.4375,
46
+ "learning_rate": 3.058823529411765e-06,
47
+ "loss": 1.1601675987243651,
48
+ "mean_token_accuracy": 0.7548606485128403,
49
+ "num_tokens": 527876.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 0.5394699670374393,
54
+ "epoch": 0.019602077820248948,
55
+ "grad_norm": 10.125,
56
+ "learning_rate": 3.843137254901962e-06,
57
+ "loss": 1.0834686279296875,
58
+ "mean_token_accuracy": 0.7578018061816693,
59
+ "num_tokens": 658209.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 0.5797790199518204,
64
+ "epoch": 0.023522493384298737,
65
+ "grad_norm": 6.28125,
66
+ "learning_rate": 4.627450980392157e-06,
67
+ "loss": 0.9067214965820313,
68
+ "mean_token_accuracy": 0.7733228243887424,
69
+ "num_tokens": 791743.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 0.6502218697220087,
74
+ "epoch": 0.027442908948348527,
75
+ "grad_norm": 5.0,
76
+ "learning_rate": 5.411764705882353e-06,
77
+ "loss": 0.8242039680480957,
78
+ "mean_token_accuracy": 0.7773927368223668,
79
+ "num_tokens": 922196.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 0.6219119645655156,
84
+ "epoch": 0.031363324512398316,
85
+ "grad_norm": 2.953125,
86
+ "learning_rate": 6.19607843137255e-06,
87
+ "loss": 0.7194723129272461,
88
+ "mean_token_accuracy": 0.7994848638772964,
89
+ "num_tokens": 1053869.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 0.6228452675044537,
94
+ "epoch": 0.035283740076448106,
95
+ "grad_norm": 2.796875,
96
+ "learning_rate": 6.9803921568627454e-06,
97
+ "loss": 0.6950749397277832,
98
+ "mean_token_accuracy": 0.8034804306924344,
99
+ "num_tokens": 1179999.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 0.6416549410670995,
104
+ "epoch": 0.039204155640497895,
105
+ "grad_norm": 2.890625,
106
+ "learning_rate": 7.764705882352941e-06,
107
+ "loss": 0.6787714958190918,
108
+ "mean_token_accuracy": 0.8052642524242402,
109
+ "num_tokens": 1313342.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 0.6458045944571496,
114
+ "epoch": 0.043124571204547685,
115
+ "grad_norm": 2.078125,
116
+ "learning_rate": 8.549019607843138e-06,
117
+ "loss": 0.6732481956481934,
118
+ "mean_token_accuracy": 0.803642563521862,
119
+ "num_tokens": 1444756.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 0.634775460511446,
124
+ "epoch": 0.047044986768597474,
125
+ "grad_norm": 1.9375,
126
+ "learning_rate": 9.333333333333334e-06,
127
+ "loss": 0.6695634841918945,
128
+ "mean_token_accuracy": 0.8045761771500111,
129
+ "num_tokens": 1577527.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 0.6200754787772894,
134
+ "epoch": 0.050965402332647264,
135
+ "grad_norm": 1.9765625,
136
+ "learning_rate": 1.011764705882353e-05,
137
+ "loss": 0.6515310764312744,
138
+ "mean_token_accuracy": 0.8107773900032044,
139
+ "num_tokens": 1712989.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 0.6283363737165928,
144
+ "epoch": 0.05488581789669705,
145
+ "grad_norm": 1.859375,
146
+ "learning_rate": 1.0901960784313726e-05,
147
+ "loss": 0.6463139057159424,
148
+ "mean_token_accuracy": 0.8106771603226661,
149
+ "num_tokens": 1842532.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 0.6225632183253765,
154
+ "epoch": 0.058806233460746836,
155
+ "grad_norm": 1.90625,
156
+ "learning_rate": 1.1686274509803922e-05,
157
+ "loss": 0.6356926918029785,
158
+ "mean_token_accuracy": 0.8131030358374118,
159
+ "num_tokens": 1973776.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 0.5999462876468897,
164
+ "epoch": 0.06272664902479663,
165
+ "grad_norm": 1.9375,
166
+ "learning_rate": 1.2470588235294119e-05,
167
+ "loss": 0.6088289737701416,
168
+ "mean_token_accuracy": 0.8192408412694931,
169
+ "num_tokens": 2106467.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 0.6009281285107135,
174
+ "epoch": 0.06664706458884642,
175
+ "grad_norm": 1.96875,
176
+ "learning_rate": 1.3254901960784314e-05,
177
+ "loss": 0.6100459575653077,
178
+ "mean_token_accuracy": 0.8174788504838943,
179
+ "num_tokens": 2243425.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 0.6414772845804692,
184
+ "epoch": 0.07056748015289621,
185
+ "grad_norm": 2.0625,
186
+ "learning_rate": 1.403921568627451e-05,
187
+ "loss": 0.6445958614349365,
188
+ "mean_token_accuracy": 0.8082262150943279,
189
+ "num_tokens": 2376987.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 0.6355918630957603,
194
+ "epoch": 0.074487895716946,
195
+ "grad_norm": 1.921875,
196
+ "learning_rate": 1.4823529411764707e-05,
197
+ "loss": 0.6472668170928955,
198
+ "mean_token_accuracy": 0.8071325562894345,
199
+ "num_tokens": 2510548.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 0.6236784052103758,
204
+ "epoch": 0.07840831128099579,
205
+ "grad_norm": 1.875,
206
+ "learning_rate": 1.5607843137254904e-05,
207
+ "loss": 0.6356116771697998,
208
+ "mean_token_accuracy": 0.8135113798081874,
209
+ "num_tokens": 2643661.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 0.6072400402277708,
214
+ "epoch": 0.08232872684504558,
215
+ "grad_norm": 1.84375,
216
+ "learning_rate": 1.63921568627451e-05,
217
+ "loss": 0.6156791687011719,
218
+ "mean_token_accuracy": 0.8167861931025981,
219
+ "num_tokens": 2774529.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.5829322092235089,
224
+ "epoch": 0.08624914240909537,
225
+ "grad_norm": 1.8203125,
226
+ "learning_rate": 1.7176470588235293e-05,
227
+ "loss": 0.5909051418304443,
228
+ "mean_token_accuracy": 0.8222801245748996,
229
+ "num_tokens": 2908058.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.6099963016808033,
234
+ "epoch": 0.09016955797314516,
235
+ "grad_norm": 1.6875,
236
+ "learning_rate": 1.796078431372549e-05,
237
+ "loss": 0.6119081974029541,
238
+ "mean_token_accuracy": 0.8179662935435772,
239
+ "num_tokens": 3040096.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.6330130327492952,
244
+ "epoch": 0.09408997353719495,
245
+ "grad_norm": 1.96875,
246
+ "learning_rate": 1.8745098039215686e-05,
247
+ "loss": 0.6453900814056397,
248
+ "mean_token_accuracy": 0.8088094264268875,
249
+ "num_tokens": 3171172.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.6220353674143553,
254
+ "epoch": 0.09801038910124474,
255
+ "grad_norm": 1.9609375,
256
+ "learning_rate": 1.9529411764705885e-05,
257
+ "loss": 0.6278114318847656,
258
+ "mean_token_accuracy": 0.8124852173030377,
259
+ "num_tokens": 3304081.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 0.6234287895262242,
264
+ "epoch": 0.10193080466529453,
265
+ "grad_norm": 1.8359375,
266
+ "learning_rate": 1.996515679442509e-05,
267
+ "loss": 0.6276582717895508,
268
+ "mean_token_accuracy": 0.8146361976861953,
269
+ "num_tokens": 3438063.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 0.6037415701895952,
274
+ "epoch": 0.10585122022934432,
275
+ "grad_norm": 1.8125,
276
+ "learning_rate": 1.9878048780487806e-05,
277
+ "loss": 0.6147459506988525,
278
+ "mean_token_accuracy": 0.8160658605396748,
279
+ "num_tokens": 3572684.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 0.6154889557510614,
284
+ "epoch": 0.1097716357933941,
285
+ "grad_norm": 1.828125,
286
+ "learning_rate": 1.9790940766550523e-05,
287
+ "loss": 0.620861291885376,
288
+ "mean_token_accuracy": 0.8154690660536289,
289
+ "num_tokens": 3706498.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 0.5835129704326392,
294
+ "epoch": 0.11369205135744388,
295
+ "grad_norm": 1.828125,
296
+ "learning_rate": 1.970383275261324e-05,
297
+ "loss": 0.5846771240234375,
298
+ "mean_token_accuracy": 0.8243817768990993,
299
+ "num_tokens": 3837023.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 0.5867879837751389,
304
+ "epoch": 0.11761246692149367,
305
+ "grad_norm": 1.828125,
306
+ "learning_rate": 1.961672473867596e-05,
307
+ "loss": 0.5936897277832032,
308
+ "mean_token_accuracy": 0.8227211274206638,
309
+ "num_tokens": 3966412.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 0.6125645771622658,
314
+ "epoch": 0.12153288248554346,
315
+ "grad_norm": 1.6953125,
316
+ "learning_rate": 1.9529616724738677e-05,
317
+ "loss": 0.6198267459869384,
318
+ "mean_token_accuracy": 0.8152951590716839,
319
+ "num_tokens": 4096711.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 0.610775976255536,
324
+ "epoch": 0.12545329804959326,
325
+ "grad_norm": 1.9296875,
326
+ "learning_rate": 1.9442508710801397e-05,
327
+ "loss": 0.6171989917755127,
328
+ "mean_token_accuracy": 0.815869303047657,
329
+ "num_tokens": 4228066.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 0.6040709633380175,
334
+ "epoch": 0.12937371361364305,
335
+ "grad_norm": 1.9609375,
336
+ "learning_rate": 1.9355400696864114e-05,
337
+ "loss": 0.6103555202484131,
338
+ "mean_token_accuracy": 0.8194533243775368,
339
+ "num_tokens": 4357664.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 0.6143804207444191,
344
+ "epoch": 0.13329412917769284,
345
+ "grad_norm": 1.8828125,
346
+ "learning_rate": 1.926829268292683e-05,
347
+ "loss": 0.6221739768981933,
348
+ "mean_token_accuracy": 0.8153439976274968,
349
+ "num_tokens": 4487688.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 0.6031508389860392,
354
+ "epoch": 0.13721454474174263,
355
+ "grad_norm": 1.84375,
356
+ "learning_rate": 1.9181184668989547e-05,
357
+ "loss": 0.6023824214935303,
358
+ "mean_token_accuracy": 0.8190992563962937,
359
+ "num_tokens": 4619172.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 0.6009325047954917,
364
+ "epoch": 0.14113496030579242,
365
+ "grad_norm": 1.8125,
366
+ "learning_rate": 1.9094076655052267e-05,
367
+ "loss": 0.6117077827453613,
368
+ "mean_token_accuracy": 0.8167035676538944,
369
+ "num_tokens": 4749078.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 0.6013668723404407,
374
+ "epoch": 0.1450553758698422,
375
+ "grad_norm": 1.6875,
376
+ "learning_rate": 1.9006968641114984e-05,
377
+ "loss": 0.5986335277557373,
378
+ "mean_token_accuracy": 0.818621464818716,
379
+ "num_tokens": 4881903.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 0.5910887397825718,
384
+ "epoch": 0.148975791433892,
385
+ "grad_norm": 1.7578125,
386
+ "learning_rate": 1.89198606271777e-05,
387
+ "loss": 0.5908828735351562,
388
+ "mean_token_accuracy": 0.8213229507207871,
389
+ "num_tokens": 5015453.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 0.5584426306188106,
394
+ "epoch": 0.1528962069979418,
395
+ "grad_norm": 1.7265625,
396
+ "learning_rate": 1.8832752613240418e-05,
397
+ "loss": 0.5678494930267334,
398
+ "mean_token_accuracy": 0.8277932897210121,
399
+ "num_tokens": 5147133.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 0.5676757667213679,
404
+ "epoch": 0.15681662256199158,
405
+ "grad_norm": 1.6640625,
406
+ "learning_rate": 1.8745644599303138e-05,
407
+ "loss": 0.5685997009277344,
408
+ "mean_token_accuracy": 0.8280104413628578,
409
+ "num_tokens": 5278668.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 0.5922414932399989,
414
+ "epoch": 0.16073703812604137,
415
+ "grad_norm": 1.765625,
416
+ "learning_rate": 1.8658536585365855e-05,
417
+ "loss": 0.5975120067596436,
418
+ "mean_token_accuracy": 0.8212826780974865,
419
+ "num_tokens": 5411982.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 0.6230888426303863,
424
+ "epoch": 0.16465745369009116,
425
+ "grad_norm": 1.84375,
426
+ "learning_rate": 1.8571428571428575e-05,
427
+ "loss": 0.6278081417083741,
428
+ "mean_token_accuracy": 0.8109267845749855,
429
+ "num_tokens": 5546647.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.5828145634382963,
434
+ "epoch": 0.16857786925414095,
435
+ "grad_norm": 1.765625,
436
+ "learning_rate": 1.8484320557491292e-05,
437
+ "loss": 0.5854227542877197,
438
+ "mean_token_accuracy": 0.8228457011282444,
439
+ "num_tokens": 5679428.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.5902278333902359,
444
+ "epoch": 0.17249828481819074,
445
+ "grad_norm": 1.75,
446
+ "learning_rate": 1.839721254355401e-05,
447
+ "loss": 0.5893653869628906,
448
+ "mean_token_accuracy": 0.8216007687151432,
449
+ "num_tokens": 5811806.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.5815000183880329,
454
+ "epoch": 0.17641870038224053,
455
+ "grad_norm": 1.828125,
456
+ "learning_rate": 1.8310104529616726e-05,
457
+ "loss": 0.5896779060363769,
458
+ "mean_token_accuracy": 0.8225190363824367,
459
+ "num_tokens": 5945059.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.6150704674422741,
464
+ "epoch": 0.18033911594629032,
465
+ "grad_norm": 1.828125,
466
+ "learning_rate": 1.8222996515679442e-05,
467
+ "loss": 0.6127326965332032,
468
+ "mean_token_accuracy": 0.8161274991929531,
469
+ "num_tokens": 6075109.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.5994422011077404,
474
+ "epoch": 0.1842595315103401,
475
+ "grad_norm": 1.578125,
476
+ "learning_rate": 1.8135888501742163e-05,
477
+ "loss": 0.6054869174957276,
478
+ "mean_token_accuracy": 0.8187170140445232,
479
+ "num_tokens": 6206390.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.5805528864264489,
484
+ "epoch": 0.1881799470743899,
485
+ "grad_norm": 1.875,
486
+ "learning_rate": 1.804878048780488e-05,
487
+ "loss": 0.5855868339538575,
488
+ "mean_token_accuracy": 0.8244414009153843,
489
+ "num_tokens": 6343008.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.591498001664877,
494
+ "epoch": 0.1921003626384397,
495
+ "grad_norm": 1.7578125,
496
+ "learning_rate": 1.7961672473867596e-05,
497
+ "loss": 0.5893340587615967,
498
+ "mean_token_accuracy": 0.8216389425098896,
499
+ "num_tokens": 6474582.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.626478412002325,
504
+ "epoch": 0.19602077820248948,
505
+ "grad_norm": 1.921875,
506
+ "learning_rate": 1.7874564459930313e-05,
507
+ "loss": 0.6263772487640381,
508
+ "mean_token_accuracy": 0.8120981454849243,
509
+ "num_tokens": 6607442.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.5900808859616518,
514
+ "epoch": 0.19994119376653927,
515
+ "grad_norm": 1.5546875,
516
+ "learning_rate": 1.7787456445993033e-05,
517
+ "loss": 0.6009780883789062,
518
+ "mean_token_accuracy": 0.819120641052723,
519
+ "num_tokens": 6734350.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.600034212693572,
524
+ "epoch": 0.20386160933058906,
525
+ "grad_norm": 1.78125,
526
+ "learning_rate": 1.770034843205575e-05,
527
+ "loss": 0.5948196887969971,
528
+ "mean_token_accuracy": 0.8216341696679592,
529
+ "num_tokens": 6869288.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.5796573795378208,
534
+ "epoch": 0.20778202489463884,
535
+ "grad_norm": 1.9296875,
536
+ "learning_rate": 1.761324041811847e-05,
537
+ "loss": 0.5877546787261962,
538
+ "mean_token_accuracy": 0.8229651033878327,
539
+ "num_tokens": 7002603.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.5981475539505482,
544
+ "epoch": 0.21170244045868863,
545
+ "grad_norm": 1.734375,
546
+ "learning_rate": 1.7526132404181187e-05,
547
+ "loss": 0.5986037254333496,
548
+ "mean_token_accuracy": 0.8196288175880909,
549
+ "num_tokens": 7135386.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 0.5765955623239278,
554
+ "epoch": 0.21562285602273842,
555
+ "grad_norm": 1.6484375,
556
+ "learning_rate": 1.7439024390243904e-05,
557
+ "loss": 0.5820430278778076,
558
+ "mean_token_accuracy": 0.8261640995740891,
559
+ "num_tokens": 7269113.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 0.5835400529205799,
564
+ "epoch": 0.2195432715867882,
565
+ "grad_norm": 1.96875,
566
+ "learning_rate": 1.735191637630662e-05,
567
+ "loss": 0.5847792148590087,
568
+ "mean_token_accuracy": 0.8227986313402653,
569
+ "num_tokens": 7397199.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 0.5926254410296679,
574
+ "epoch": 0.22346368715083798,
575
+ "grad_norm": 1.6640625,
576
+ "learning_rate": 1.7264808362369338e-05,
577
+ "loss": 0.5946429252624512,
578
+ "mean_token_accuracy": 0.8213822573423386,
579
+ "num_tokens": 7528883.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 0.5802333429455757,
584
+ "epoch": 0.22738410271488776,
585
+ "grad_norm": 1.8203125,
586
+ "learning_rate": 1.7177700348432058e-05,
587
+ "loss": 0.5808559417724609,
588
+ "mean_token_accuracy": 0.8252004392445087,
589
+ "num_tokens": 7661460.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 0.5792893666774035,
594
+ "epoch": 0.23130451827893755,
595
+ "grad_norm": 1.78125,
596
+ "learning_rate": 1.7090592334494775e-05,
597
+ "loss": 0.5844778060913086,
598
+ "mean_token_accuracy": 0.8232058539986611,
599
+ "num_tokens": 7792654.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 0.5875658553093672,
604
+ "epoch": 0.23522493384298734,
605
+ "grad_norm": 1.6328125,
606
+ "learning_rate": 1.700348432055749e-05,
607
+ "loss": 0.5934580802917481,
608
+ "mean_token_accuracy": 0.8228402160108089,
609
+ "num_tokens": 7929712.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 0.6018010344356298,
614
+ "epoch": 0.23914534940703713,
615
+ "grad_norm": 1.8125,
616
+ "learning_rate": 1.691637630662021e-05,
617
+ "loss": 0.6018301010131836,
618
+ "mean_token_accuracy": 0.8196375787258148,
619
+ "num_tokens": 8062790.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 0.5656840804964304,
624
+ "epoch": 0.24306576497108692,
625
+ "grad_norm": 1.65625,
626
+ "learning_rate": 1.682926829268293e-05,
627
+ "loss": 0.5681605339050293,
628
+ "mean_token_accuracy": 0.829612398892641,
629
+ "num_tokens": 8195902.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 0.590726625546813,
634
+ "epoch": 0.2469861805351367,
635
+ "grad_norm": 1.7265625,
636
+ "learning_rate": 1.6742160278745645e-05,
637
+ "loss": 0.5950279712677002,
638
+ "mean_token_accuracy": 0.8214106187224388,
639
+ "num_tokens": 8326346.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 0.5998687874525785,
644
+ "epoch": 0.25090659609918653,
645
+ "grad_norm": 1.734375,
646
+ "learning_rate": 1.6655052264808366e-05,
647
+ "loss": 0.5983600616455078,
648
+ "mean_token_accuracy": 0.8189966283738613,
649
+ "num_tokens": 8460029.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 0.5715964876115323,
654
+ "epoch": 0.2548270116632363,
655
+ "grad_norm": 1.8359375,
656
+ "learning_rate": 1.6567944250871082e-05,
657
+ "loss": 0.5704405307769775,
658
+ "mean_token_accuracy": 0.8268976680934429,
659
+ "num_tokens": 8593771.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 0.5904807798564434,
664
+ "epoch": 0.2587474272272861,
665
+ "grad_norm": 1.7265625,
666
+ "learning_rate": 1.64808362369338e-05,
667
+ "loss": 0.5930946350097657,
668
+ "mean_token_accuracy": 0.8214319236576557,
669
+ "num_tokens": 8727138.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 0.587051372230053,
674
+ "epoch": 0.26266784279133587,
675
+ "grad_norm": 1.9296875,
676
+ "learning_rate": 1.6393728222996516e-05,
677
+ "loss": 0.5872443675994873,
678
+ "mean_token_accuracy": 0.8230736367404461,
679
+ "num_tokens": 8859295.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 0.5858452804386616,
684
+ "epoch": 0.2665882583553857,
685
+ "grad_norm": 1.6640625,
686
+ "learning_rate": 1.6306620209059233e-05,
687
+ "loss": 0.5850666522979736,
688
+ "mean_token_accuracy": 0.824101684987545,
689
+ "num_tokens": 8990881.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 0.5871569372713565,
694
+ "epoch": 0.27050867391943545,
695
+ "grad_norm": 1.640625,
696
+ "learning_rate": 1.6219512195121953e-05,
697
+ "loss": 0.594059944152832,
698
+ "mean_token_accuracy": 0.8213366828858852,
699
+ "num_tokens": 9123499.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 0.5705580234527587,
704
+ "epoch": 0.27442908948348527,
705
+ "grad_norm": 1.828125,
706
+ "learning_rate": 1.613240418118467e-05,
707
+ "loss": 0.578093433380127,
708
+ "mean_token_accuracy": 0.8268906474113464,
709
+ "num_tokens": 9257724.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 0.5944200098514557,
714
+ "epoch": 0.27834950504753503,
715
+ "grad_norm": 1.8671875,
716
+ "learning_rate": 1.604529616724739e-05,
717
+ "loss": 0.5930802345275878,
718
+ "mean_token_accuracy": 0.8196114718914032,
719
+ "num_tokens": 9393701.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 0.5565787792205811,
724
+ "epoch": 0.28226992061158485,
725
+ "grad_norm": 1.7265625,
726
+ "learning_rate": 1.5958188153310107e-05,
727
+ "loss": 0.556602954864502,
728
+ "mean_token_accuracy": 0.8305394992232322,
729
+ "num_tokens": 9525747.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 0.5937954898923635,
734
+ "epoch": 0.2861903361756346,
735
+ "grad_norm": 1.78125,
736
+ "learning_rate": 1.5871080139372824e-05,
737
+ "loss": 0.6040976524353028,
738
+ "mean_token_accuracy": 0.819646991044283,
739
+ "num_tokens": 9656394.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 0.5655720047652721,
744
+ "epoch": 0.2901107517396844,
745
+ "grad_norm": 1.515625,
746
+ "learning_rate": 1.578397212543554e-05,
747
+ "loss": 0.5677313804626465,
748
+ "mean_token_accuracy": 0.8287642747163773,
749
+ "num_tokens": 9786846.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 0.5690872885286808,
754
+ "epoch": 0.2940311673037342,
755
+ "grad_norm": 1.59375,
756
+ "learning_rate": 1.569686411149826e-05,
757
+ "loss": 0.577194356918335,
758
+ "mean_token_accuracy": 0.8268992677330971,
759
+ "num_tokens": 9920679.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 0.5751310177147388,
764
+ "epoch": 0.297951582867784,
765
+ "grad_norm": 1.609375,
766
+ "learning_rate": 1.5609756097560978e-05,
767
+ "loss": 0.5761964797973633,
768
+ "mean_token_accuracy": 0.8257504008710385,
769
+ "num_tokens": 10052844.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 0.5573539979755878,
774
+ "epoch": 0.30187199843183377,
775
+ "grad_norm": 1.71875,
776
+ "learning_rate": 1.5522648083623694e-05,
777
+ "loss": 0.5600615978240967,
778
+ "mean_token_accuracy": 0.8300629191100597,
779
+ "num_tokens": 10183471.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 0.5975829754024744,
784
+ "epoch": 0.3057924139958836,
785
+ "grad_norm": 1.703125,
786
+ "learning_rate": 1.543554006968641e-05,
787
+ "loss": 0.5992860317230224,
788
+ "mean_token_accuracy": 0.8206060849130153,
789
+ "num_tokens": 10313392.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 0.5773690041154623,
794
+ "epoch": 0.30971282955993334,
795
+ "grad_norm": 1.75,
796
+ "learning_rate": 1.5348432055749128e-05,
797
+ "loss": 0.5855609893798828,
798
+ "mean_token_accuracy": 0.8234818607568741,
799
+ "num_tokens": 10448285.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 0.5769198387861252,
804
+ "epoch": 0.31363324512398316,
805
+ "grad_norm": 1.6484375,
806
+ "learning_rate": 1.5261324041811848e-05,
807
+ "loss": 0.5776193618774415,
808
+ "mean_token_accuracy": 0.8255582615733147,
809
+ "num_tokens": 10584236.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 0.5881247483193874,
814
+ "epoch": 0.3175536606880329,
815
+ "grad_norm": 1.7734375,
816
+ "learning_rate": 1.5174216027874567e-05,
817
+ "loss": 0.593212080001831,
818
+ "mean_token_accuracy": 0.8212029553949833,
819
+ "num_tokens": 10716890.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 0.5945972129702568,
824
+ "epoch": 0.32147407625208274,
825
+ "grad_norm": 1.6171875,
826
+ "learning_rate": 1.5087108013937284e-05,
827
+ "loss": 0.5877634525299072,
828
+ "mean_token_accuracy": 0.8238732725381851,
829
+ "num_tokens": 10852266.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 0.5811539199203253,
834
+ "epoch": 0.3253944918161325,
835
+ "grad_norm": 1.796875,
836
+ "learning_rate": 1.5000000000000002e-05,
837
+ "loss": 0.5907205104827881,
838
+ "mean_token_accuracy": 0.8204894207417965,
839
+ "num_tokens": 10985262.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 0.5849005732685327,
844
+ "epoch": 0.3293149073801823,
845
+ "grad_norm": 1.5625,
846
+ "learning_rate": 1.4912891986062719e-05,
847
+ "loss": 0.5833800792694092,
848
+ "mean_token_accuracy": 0.8241490855813026,
849
+ "num_tokens": 11112681.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 0.5829326704144477,
854
+ "epoch": 0.3332353229442321,
855
+ "grad_norm": 1.78125,
856
+ "learning_rate": 1.4825783972125436e-05,
857
+ "loss": 0.5943526268005371,
858
+ "mean_token_accuracy": 0.8208344720304013,
859
+ "num_tokens": 11240261.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 0.6181501217186451,
864
+ "epoch": 0.3371557385082819,
865
+ "grad_norm": 1.7109375,
866
+ "learning_rate": 1.4738675958188156e-05,
867
+ "loss": 0.6180446147918701,
868
+ "mean_token_accuracy": 0.8148257359862328,
869
+ "num_tokens": 11370906.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 0.5804374283179641,
874
+ "epoch": 0.34107615407233166,
875
+ "grad_norm": 1.65625,
876
+ "learning_rate": 1.4651567944250873e-05,
877
+ "loss": 0.5808406829833984,
878
+ "mean_token_accuracy": 0.8237656883895397,
879
+ "num_tokens": 11502338.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 0.5756821602582931,
884
+ "epoch": 0.3449965696363815,
885
+ "grad_norm": 1.703125,
886
+ "learning_rate": 1.456445993031359e-05,
887
+ "loss": 0.5798084735870361,
888
+ "mean_token_accuracy": 0.8261652231216431,
889
+ "num_tokens": 11632165.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 0.6068328361958265,
894
+ "epoch": 0.34891698520043124,
895
+ "grad_norm": 1.671875,
896
+ "learning_rate": 1.4477351916376308e-05,
897
+ "loss": 0.6065092563629151,
898
+ "mean_token_accuracy": 0.8196589723229408,
899
+ "num_tokens": 11763511.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 0.5891778867691755,
904
+ "epoch": 0.35283740076448106,
905
+ "grad_norm": 1.609375,
906
+ "learning_rate": 1.4390243902439025e-05,
907
+ "loss": 0.592728567123413,
908
+ "mean_token_accuracy": 0.8230046413838863,
909
+ "num_tokens": 11895820.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 0.6052472297102213,
914
+ "epoch": 0.3567578163285308,
915
+ "grad_norm": 1.7265625,
916
+ "learning_rate": 1.4303135888501742e-05,
917
+ "loss": 0.6157879829406738,
918
+ "mean_token_accuracy": 0.8158482946455479,
919
+ "num_tokens": 12025859.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 0.5880816575139761,
924
+ "epoch": 0.36067823189258064,
925
+ "grad_norm": 1.828125,
926
+ "learning_rate": 1.4216027874564462e-05,
927
+ "loss": 0.5871949672698975,
928
+ "mean_token_accuracy": 0.8238230250775814,
929
+ "num_tokens": 12154614.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 0.5858425028622151,
934
+ "epoch": 0.3645986474566304,
935
+ "grad_norm": 1.65625,
936
+ "learning_rate": 1.4128919860627179e-05,
937
+ "loss": 0.5850693225860596,
938
+ "mean_token_accuracy": 0.8225606501102447,
939
+ "num_tokens": 12289271.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 0.6006566017866135,
944
+ "epoch": 0.3685190630206802,
945
+ "grad_norm": 1.765625,
946
+ "learning_rate": 1.4041811846689897e-05,
947
+ "loss": 0.6035278797149658,
948
+ "mean_token_accuracy": 0.8171994499862194,
949
+ "num_tokens": 12419706.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 0.5940211407840252,
954
+ "epoch": 0.37243947858473,
955
+ "grad_norm": 1.9375,
956
+ "learning_rate": 1.3954703832752614e-05,
957
+ "loss": 0.5872697830200195,
958
+ "mean_token_accuracy": 0.8215577825903893,
959
+ "num_tokens": 12551888.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 0.5769729625433684,
964
+ "epoch": 0.3763598941487798,
965
+ "grad_norm": 1.78125,
966
+ "learning_rate": 1.3867595818815331e-05,
967
+ "loss": 0.5793991088867188,
968
+ "mean_token_accuracy": 0.8250952236354351,
969
+ "num_tokens": 12685739.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 0.5762162335216999,
974
+ "epoch": 0.38028030971282956,
975
+ "grad_norm": 1.640625,
976
+ "learning_rate": 1.378048780487805e-05,
977
+ "loss": 0.5773720741271973,
978
+ "mean_token_accuracy": 0.8255156740546227,
979
+ "num_tokens": 12817700.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 0.5669316282495857,
984
+ "epoch": 0.3842007252768794,
985
+ "grad_norm": 1.78125,
986
+ "learning_rate": 1.3693379790940768e-05,
987
+ "loss": 0.5728450775146484,
988
+ "mean_token_accuracy": 0.8277040965855121,
989
+ "num_tokens": 12949817.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 0.5594307694584131,
994
+ "epoch": 0.38812114084092914,
995
+ "grad_norm": 1.796875,
996
+ "learning_rate": 1.3606271777003486e-05,
997
+ "loss": 0.5640110015869141,
998
+ "mean_token_accuracy": 0.8282136105000972,
999
+ "num_tokens": 13081117.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 0.5791688058525324,
1004
+ "epoch": 0.39204155640497895,
1005
+ "grad_norm": 1.7578125,
1006
+ "learning_rate": 1.3519163763066203e-05,
1007
+ "loss": 0.5831320285797119,
1008
+ "mean_token_accuracy": 0.8250723823904991,
1009
+ "num_tokens": 13210883.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 0.5651333026587964,
1014
+ "epoch": 0.3959619719690287,
1015
+ "grad_norm": 1.5859375,
1016
+ "learning_rate": 1.343205574912892e-05,
1017
+ "loss": 0.5608913898468018,
1018
+ "mean_token_accuracy": 0.8293542221188546,
1019
+ "num_tokens": 13341155.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 0.5587001539766788,
1024
+ "epoch": 0.39988238753307853,
1025
+ "grad_norm": 1.59375,
1026
+ "learning_rate": 1.3344947735191639e-05,
1027
+ "loss": 0.5611968040466309,
1028
+ "mean_token_accuracy": 0.8305698707699776,
1029
+ "num_tokens": 13474505.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 0.5640604499727487,
1034
+ "epoch": 0.4038028030971283,
1035
+ "grad_norm": 1.796875,
1036
+ "learning_rate": 1.3257839721254357e-05,
1037
+ "loss": 0.565390157699585,
1038
+ "mean_token_accuracy": 0.8282499298453331,
1039
+ "num_tokens": 13607139.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 0.5637519735842943,
1044
+ "epoch": 0.4077232186611781,
1045
+ "grad_norm": 1.6484375,
1046
+ "learning_rate": 1.3170731707317076e-05,
1047
+ "loss": 0.5695384502410888,
1048
+ "mean_token_accuracy": 0.8287955388426781,
1049
+ "num_tokens": 13738326.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 0.580963845923543,
1054
+ "epoch": 0.4116436342252279,
1055
+ "grad_norm": 1.765625,
1056
+ "learning_rate": 1.3083623693379792e-05,
1057
+ "loss": 0.5769538402557373,
1058
+ "mean_token_accuracy": 0.8235732421278954,
1059
+ "num_tokens": 13868452.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 0.5678158435970545,
1064
+ "epoch": 0.4155640497892777,
1065
+ "grad_norm": 1.8125,
1066
+ "learning_rate": 1.299651567944251e-05,
1067
+ "loss": 0.5710372924804688,
1068
+ "mean_token_accuracy": 0.8272775359451771,
1069
+ "num_tokens": 14002979.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 0.5668239049613476,
1074
+ "epoch": 0.41948446535332745,
1075
+ "grad_norm": 1.6328125,
1076
+ "learning_rate": 1.2909407665505226e-05,
1077
+ "loss": 0.5655869483947754,
1078
+ "mean_token_accuracy": 0.8278685718774795,
1079
+ "num_tokens": 14138341.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 0.5467250619083643,
1084
+ "epoch": 0.42340488091737727,
1085
+ "grad_norm": 1.75,
1086
+ "learning_rate": 1.2822299651567945e-05,
1087
+ "loss": 0.5479381561279297,
1088
+ "mean_token_accuracy": 0.8330328531563282,
1089
+ "num_tokens": 14266623.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 0.574169309064746,
1094
+ "epoch": 0.42732529648142703,
1095
+ "grad_norm": 1.578125,
1096
+ "learning_rate": 1.2735191637630663e-05,
1097
+ "loss": 0.580225658416748,
1098
+ "mean_token_accuracy": 0.8270016670227051,
1099
+ "num_tokens": 14397016.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 0.5842641271650791,
1104
+ "epoch": 0.43124571204547685,
1105
+ "grad_norm": 1.6171875,
1106
+ "learning_rate": 1.2648083623693382e-05,
1107
+ "loss": 0.5883337020874023,
1108
+ "mean_token_accuracy": 0.8239847645163536,
1109
+ "num_tokens": 14530946.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 0.5795733086764813,
1114
+ "epoch": 0.4351661276095266,
1115
+ "grad_norm": 1.71875,
1116
+ "learning_rate": 1.2560975609756098e-05,
1117
+ "loss": 0.5835409164428711,
1118
+ "mean_token_accuracy": 0.8237806595861912,
1119
+ "num_tokens": 14668493.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 0.5661398351192475,
1124
+ "epoch": 0.4390865431735764,
1125
+ "grad_norm": 1.8203125,
1126
+ "learning_rate": 1.2473867595818815e-05,
1127
+ "loss": 0.5688445091247558,
1128
+ "mean_token_accuracy": 0.8283659070730209,
1129
+ "num_tokens": 14798023.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 0.5716561190783978,
1134
+ "epoch": 0.4430069587376262,
1135
+ "grad_norm": 1.734375,
1136
+ "learning_rate": 1.2386759581881534e-05,
1137
+ "loss": 0.5737256526947021,
1138
+ "mean_token_accuracy": 0.8261031933128834,
1139
+ "num_tokens": 14927026.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 0.5815631907433272,
1144
+ "epoch": 0.44692737430167595,
1145
+ "grad_norm": 1.796875,
1146
+ "learning_rate": 1.2299651567944252e-05,
1147
+ "loss": 0.5841750144958496,
1148
+ "mean_token_accuracy": 0.8246684789657592,
1149
+ "num_tokens": 15059756.0,
1150
+ "step": 1140
1151
+ },
1152
+ {
1153
+ "entropy": 0.5747593600302935,
1154
+ "epoch": 0.45084778986572577,
1155
+ "grad_norm": 1.5546875,
1156
+ "learning_rate": 1.2212543554006971e-05,
1157
+ "loss": 0.5716405391693116,
1158
+ "mean_token_accuracy": 0.8256537966430187,
1159
+ "num_tokens": 15192135.0,
1160
+ "step": 1150
1161
+ },
1162
+ {
1163
+ "entropy": 0.5864708483219147,
1164
+ "epoch": 0.45476820542977553,
1165
+ "grad_norm": 1.875,
1166
+ "learning_rate": 1.2125435540069688e-05,
1167
+ "loss": 0.5913478851318359,
1168
+ "mean_token_accuracy": 0.8215075552463531,
1169
+ "num_tokens": 15326137.0,
1170
+ "step": 1160
1171
+ },
1172
+ {
1173
+ "entropy": 0.5958302663639188,
1174
+ "epoch": 0.45868862099382535,
1175
+ "grad_norm": 1.6796875,
1176
+ "learning_rate": 1.2038327526132404e-05,
1177
+ "loss": 0.5945512294769287,
1178
+ "mean_token_accuracy": 0.8203706957399846,
1179
+ "num_tokens": 15455625.0,
1180
+ "step": 1170
1181
+ },
1182
+ {
1183
+ "entropy": 0.5705973919481039,
1184
+ "epoch": 0.4626090365578751,
1185
+ "grad_norm": 1.8203125,
1186
+ "learning_rate": 1.1951219512195123e-05,
1187
+ "loss": 0.574383544921875,
1188
+ "mean_token_accuracy": 0.8264761112630368,
1189
+ "num_tokens": 15589324.0,
1190
+ "step": 1180
1191
+ },
1192
+ {
1193
+ "entropy": 0.5563835114240646,
1194
+ "epoch": 0.4665294521219249,
1195
+ "grad_norm": 1.640625,
1196
+ "learning_rate": 1.186411149825784e-05,
1197
+ "loss": 0.5601149559020996,
1198
+ "mean_token_accuracy": 0.8305692337453365,
1199
+ "num_tokens": 15725979.0,
1200
+ "step": 1190
1201
+ },
1202
+ {
1203
+ "entropy": 0.5868437562137843,
1204
+ "epoch": 0.4704498676859747,
1205
+ "grad_norm": 1.7109375,
1206
+ "learning_rate": 1.177700348432056e-05,
1207
+ "loss": 0.583247709274292,
1208
+ "mean_token_accuracy": 0.8226650767028332,
1209
+ "num_tokens": 15857159.0,
1210
+ "step": 1200
1211
+ },
1212
+ {
1213
+ "entropy": 0.5893880043178796,
1214
+ "epoch": 0.4743702832500245,
1215
+ "grad_norm": 1.671875,
1216
+ "learning_rate": 1.1689895470383277e-05,
1217
+ "loss": 0.6006449222564697,
1218
+ "mean_token_accuracy": 0.8204010248184204,
1219
+ "num_tokens": 15992142.0,
1220
+ "step": 1210
1221
+ },
1222
+ {
1223
+ "entropy": 0.5774647582322359,
1224
+ "epoch": 0.47829069881407427,
1225
+ "grad_norm": 1.734375,
1226
+ "learning_rate": 1.1602787456445994e-05,
1227
+ "loss": 0.5778764247894287,
1228
+ "mean_token_accuracy": 0.8252091869711876,
1229
+ "num_tokens": 16125065.0,
1230
+ "step": 1220
1231
+ },
1232
+ {
1233
+ "entropy": 0.5712914764881134,
1234
+ "epoch": 0.4822111143781241,
1235
+ "grad_norm": 1.828125,
1236
+ "learning_rate": 1.1515679442508712e-05,
1237
+ "loss": 0.569354248046875,
1238
+ "mean_token_accuracy": 0.8279658198356629,
1239
+ "num_tokens": 16254336.0,
1240
+ "step": 1230
1241
+ },
1242
+ {
1243
+ "entropy": 0.5910291790962219,
1244
+ "epoch": 0.48613152994217385,
1245
+ "grad_norm": 1.65625,
1246
+ "learning_rate": 1.1428571428571429e-05,
1247
+ "loss": 0.5976007461547852,
1248
+ "mean_token_accuracy": 0.8213621146976948,
1249
+ "num_tokens": 16387933.0,
1250
+ "step": 1240
1251
+ },
1252
+ {
1253
+ "entropy": 0.5892755780369043,
1254
+ "epoch": 0.49005194550622366,
1255
+ "grad_norm": 1.890625,
1256
+ "learning_rate": 1.1341463414634146e-05,
1257
+ "loss": 0.5938167572021484,
1258
+ "mean_token_accuracy": 0.8218666344881058,
1259
+ "num_tokens": 16519028.0,
1260
+ "step": 1250
1261
+ },
1262
+ {
1263
+ "entropy": 0.6019801579415798,
1264
+ "epoch": 0.4939723610702734,
1265
+ "grad_norm": 2.109375,
1266
+ "learning_rate": 1.1254355400696866e-05,
1267
+ "loss": 0.601860523223877,
1268
+ "mean_token_accuracy": 0.8193305231630802,
1269
+ "num_tokens": 16648075.0,
1270
+ "step": 1260
1271
+ },
1272
+ {
1273
+ "entropy": 0.577896298468113,
1274
+ "epoch": 0.49789277663432324,
1275
+ "grad_norm": 1.5859375,
1276
+ "learning_rate": 1.1167247386759583e-05,
1277
+ "loss": 0.5790606498718261,
1278
+ "mean_token_accuracy": 0.8256713755428791,
1279
+ "num_tokens": 16784827.0,
1280
+ "step": 1270
1281
+ },
1282
+ {
1283
+ "entropy": 0.5828588411211968,
1284
+ "epoch": 0.5018131921983731,
1285
+ "grad_norm": 1.8671875,
1286
+ "learning_rate": 1.1080139372822301e-05,
1287
+ "loss": 0.5897656440734863,
1288
+ "mean_token_accuracy": 0.8219093471765518,
1289
+ "num_tokens": 16914566.0,
1290
+ "step": 1280
1291
+ },
1292
+ {
1293
+ "entropy": 0.5996304292231798,
1294
+ "epoch": 0.5057336077624228,
1295
+ "grad_norm": 1.625,
1296
+ "learning_rate": 1.0993031358885018e-05,
1297
+ "loss": 0.6056152820587158,
1298
+ "mean_token_accuracy": 0.8185230061411858,
1299
+ "num_tokens": 17047396.0,
1300
+ "step": 1290
1301
+ },
1302
+ {
1303
+ "entropy": 0.5672778323292732,
1304
+ "epoch": 0.5096540233264726,
1305
+ "grad_norm": 1.7109375,
1306
+ "learning_rate": 1.0905923344947735e-05,
1307
+ "loss": 0.5606307983398438,
1308
+ "mean_token_accuracy": 0.8289205953478813,
1309
+ "num_tokens": 17173863.0,
1310
+ "step": 1300
1311
+ },
1312
+ {
1313
+ "entropy": 0.574262236058712,
1314
+ "epoch": 0.5135744388905223,
1315
+ "grad_norm": 1.546875,
1316
+ "learning_rate": 1.0818815331010455e-05,
1317
+ "loss": 0.5738330364227295,
1318
+ "mean_token_accuracy": 0.8259080529212952,
1319
+ "num_tokens": 17305917.0,
1320
+ "step": 1310
1321
+ },
1322
+ {
1323
+ "entropy": 0.5749511100351811,
1324
+ "epoch": 0.5174948544545722,
1325
+ "grad_norm": 1.671875,
1326
+ "learning_rate": 1.0731707317073172e-05,
1327
+ "loss": 0.5709137916564941,
1328
+ "mean_token_accuracy": 0.8254921354353428,
1329
+ "num_tokens": 17436774.0,
1330
+ "step": 1320
1331
+ },
1332
+ {
1333
+ "entropy": 0.595110259950161,
1334
+ "epoch": 0.521415270018622,
1335
+ "grad_norm": 1.6484375,
1336
+ "learning_rate": 1.0644599303135889e-05,
1337
+ "loss": 0.5969788551330566,
1338
+ "mean_token_accuracy": 0.8186600834131241,
1339
+ "num_tokens": 17569314.0,
1340
+ "step": 1330
1341
+ },
1342
+ {
1343
+ "entropy": 0.5713474582880735,
1344
+ "epoch": 0.5253356855826717,
1345
+ "grad_norm": 1.7421875,
1346
+ "learning_rate": 1.0557491289198607e-05,
1347
+ "loss": 0.5697742462158203,
1348
+ "mean_token_accuracy": 0.8268165580928326,
1349
+ "num_tokens": 17703511.0,
1350
+ "step": 1340
1351
+ },
1352
+ {
1353
+ "entropy": 0.5673128481954336,
1354
+ "epoch": 0.5292561011467215,
1355
+ "grad_norm": 1.8828125,
1356
+ "learning_rate": 1.0470383275261324e-05,
1357
+ "loss": 0.5756454944610596,
1358
+ "mean_token_accuracy": 0.8265850961208343,
1359
+ "num_tokens": 17837601.0,
1360
+ "step": 1350
1361
+ },
1362
+ {
1363
+ "entropy": 0.5455819856375456,
1364
+ "epoch": 0.5331765167107714,
1365
+ "grad_norm": 1.5390625,
1366
+ "learning_rate": 1.0383275261324041e-05,
1367
+ "loss": 0.5512715339660644,
1368
+ "mean_token_accuracy": 0.8321440435945988,
1369
+ "num_tokens": 17968783.0,
1370
+ "step": 1360
1371
+ },
1372
+ {
1373
+ "entropy": 0.5659995820373297,
1374
+ "epoch": 0.5370969322748211,
1375
+ "grad_norm": 1.890625,
1376
+ "learning_rate": 1.0296167247386761e-05,
1377
+ "loss": 0.5727234363555909,
1378
+ "mean_token_accuracy": 0.8270003162324429,
1379
+ "num_tokens": 18100467.0,
1380
+ "step": 1370
1381
+ },
1382
+ {
1383
+ "entropy": 0.5558266244828701,
1384
+ "epoch": 0.5410173478388709,
1385
+ "grad_norm": 1.71875,
1386
+ "learning_rate": 1.0209059233449478e-05,
1387
+ "loss": 0.5511263847351074,
1388
+ "mean_token_accuracy": 0.8310491070151329,
1389
+ "num_tokens": 18236883.0,
1390
+ "step": 1380
1391
+ },
1392
+ {
1393
+ "entropy": 0.5848776459693908,
1394
+ "epoch": 0.5449377634029207,
1395
+ "grad_norm": 1.78125,
1396
+ "learning_rate": 1.0121951219512197e-05,
1397
+ "loss": 0.5840703964233398,
1398
+ "mean_token_accuracy": 0.8221600718796254,
1399
+ "num_tokens": 18366506.0,
1400
+ "step": 1390
1401
+ },
1402
+ {
1403
+ "entropy": 0.5772740695625543,
1404
+ "epoch": 0.5488581789669705,
1405
+ "grad_norm": 1.6171875,
1406
+ "learning_rate": 1.0034843205574913e-05,
1407
+ "loss": 0.5818367004394531,
1408
+ "mean_token_accuracy": 0.8251958817243576,
1409
+ "num_tokens": 18499076.0,
1410
+ "step": 1400
1411
+ },
1412
+ {
1413
+ "entropy": 0.5877921745181084,
1414
+ "epoch": 0.5527785945310203,
1415
+ "grad_norm": 1.71875,
1416
+ "learning_rate": 9.947735191637632e-06,
1417
+ "loss": 0.5859257221221924,
1418
+ "mean_token_accuracy": 0.8229536414146423,
1419
+ "num_tokens": 18627838.0,
1420
+ "step": 1410
1421
+ },
1422
+ {
1423
+ "entropy": 0.5713149573653936,
1424
+ "epoch": 0.5566990100950701,
1425
+ "grad_norm": 1.8046875,
1426
+ "learning_rate": 9.860627177700349e-06,
1427
+ "loss": 0.5759402751922608,
1428
+ "mean_token_accuracy": 0.8259151518344879,
1429
+ "num_tokens": 18757141.0,
1430
+ "step": 1420
1431
+ },
1432
+ {
1433
+ "entropy": 0.5910889457911253,
1434
+ "epoch": 0.5606194256591198,
1435
+ "grad_norm": 1.7421875,
1436
+ "learning_rate": 9.773519163763067e-06,
1437
+ "loss": 0.5957850933074951,
1438
+ "mean_token_accuracy": 0.8201118834316731,
1439
+ "num_tokens": 18888358.0,
1440
+ "step": 1430
1441
+ },
1442
+ {
1443
+ "entropy": 0.5661411985754967,
1444
+ "epoch": 0.5645398412231697,
1445
+ "grad_norm": 1.6015625,
1446
+ "learning_rate": 9.686411149825786e-06,
1447
+ "loss": 0.5634143829345704,
1448
+ "mean_token_accuracy": 0.8272467255592346,
1449
+ "num_tokens": 19020119.0,
1450
+ "step": 1440
1451
+ },
1452
+ {
1453
+ "entropy": 0.5582704734057188,
1454
+ "epoch": 0.5684602567872195,
1455
+ "grad_norm": 1.6484375,
1456
+ "learning_rate": 9.599303135888503e-06,
1457
+ "loss": 0.5596834659576416,
1458
+ "mean_token_accuracy": 0.831222715228796,
1459
+ "num_tokens": 19152625.0,
1460
+ "step": 1450
1461
+ },
1462
+ {
1463
+ "entropy": 0.5848473913967609,
1464
+ "epoch": 0.5723806723512692,
1465
+ "grad_norm": 1.5625,
1466
+ "learning_rate": 9.51219512195122e-06,
1467
+ "loss": 0.5914738655090332,
1468
+ "mean_token_accuracy": 0.821570199728012,
1469
+ "num_tokens": 19282646.0,
1470
+ "step": 1460
1471
+ },
1472
+ {
1473
+ "entropy": 0.5610104382038117,
1474
+ "epoch": 0.576301087915319,
1475
+ "grad_norm": 1.625,
1476
+ "learning_rate": 9.425087108013938e-06,
1477
+ "loss": 0.5633234500885009,
1478
+ "mean_token_accuracy": 0.8292761743068695,
1479
+ "num_tokens": 19418710.0,
1480
+ "step": 1470
1481
+ },
1482
+ {
1483
+ "entropy": 0.5768874280154705,
1484
+ "epoch": 0.5802215034793688,
1485
+ "grad_norm": 1.6875,
1486
+ "learning_rate": 9.337979094076656e-06,
1487
+ "loss": 0.5723718166351318,
1488
+ "mean_token_accuracy": 0.827151071280241,
1489
+ "num_tokens": 19548708.0,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "entropy": 0.5698943838477135,
1494
+ "epoch": 0.5841419190434186,
1495
+ "grad_norm": 1.8359375,
1496
+ "learning_rate": 9.250871080139373e-06,
1497
+ "loss": 0.570045804977417,
1498
+ "mean_token_accuracy": 0.8272446699440479,
1499
+ "num_tokens": 19684081.0,
1500
+ "step": 1490
1501
+ },
1502
+ {
1503
+ "entropy": 0.5802167691290379,
1504
+ "epoch": 0.5880623346074684,
1505
+ "grad_norm": 1.65625,
1506
+ "learning_rate": 9.163763066202092e-06,
1507
+ "loss": 0.5790982246398926,
1508
+ "mean_token_accuracy": 0.8250758126378059,
1509
+ "num_tokens": 19817358.0,
1510
+ "step": 1500
1511
+ },
1512
+ {
1513
+ "entropy": 0.5764854367822408,
1514
+ "epoch": 0.5919827501715181,
1515
+ "grad_norm": 1.6640625,
1516
+ "learning_rate": 9.076655052264809e-06,
1517
+ "loss": 0.5897452354431152,
1518
+ "mean_token_accuracy": 0.8235625132918358,
1519
+ "num_tokens": 19947516.0,
1520
+ "step": 1510
1521
+ },
1522
+ {
1523
+ "entropy": 0.568101118132472,
1524
+ "epoch": 0.595903165735568,
1525
+ "grad_norm": 1.7734375,
1526
+ "learning_rate": 8.989547038327527e-06,
1527
+ "loss": 0.5667146205902099,
1528
+ "mean_token_accuracy": 0.8266186200082302,
1529
+ "num_tokens": 20080882.0,
1530
+ "step": 1520
1531
+ },
1532
+ {
1533
+ "entropy": 0.5726208575069904,
1534
+ "epoch": 0.5998235812996178,
1535
+ "grad_norm": 1.6171875,
1536
+ "learning_rate": 8.902439024390244e-06,
1537
+ "loss": 0.5737858772277832,
1538
+ "mean_token_accuracy": 0.8270730949938297,
1539
+ "num_tokens": 20208040.0,
1540
+ "step": 1530
1541
+ },
1542
+ {
1543
+ "entropy": 0.5912529457360506,
1544
+ "epoch": 0.6037439968636675,
1545
+ "grad_norm": 1.671875,
1546
+ "learning_rate": 8.815331010452962e-06,
1547
+ "loss": 0.5949665069580078,
1548
+ "mean_token_accuracy": 0.821034874767065,
1549
+ "num_tokens": 20342523.0,
1550
+ "step": 1540
1551
+ },
1552
+ {
1553
+ "entropy": 0.5770879618823528,
1554
+ "epoch": 0.6076644124277173,
1555
+ "grad_norm": 1.609375,
1556
+ "learning_rate": 8.728222996515681e-06,
1557
+ "loss": 0.5765514373779297,
1558
+ "mean_token_accuracy": 0.8256359219551086,
1559
+ "num_tokens": 20475073.0,
1560
+ "step": 1550
1561
+ },
1562
+ {
1563
+ "entropy": 0.5801304239779711,
1564
+ "epoch": 0.6115848279917672,
1565
+ "grad_norm": 1.8671875,
1566
+ "learning_rate": 8.641114982578398e-06,
1567
+ "loss": 0.5835923194885254,
1568
+ "mean_token_accuracy": 0.8234978429973125,
1569
+ "num_tokens": 20607544.0,
1570
+ "step": 1560
1571
+ },
1572
+ {
1573
+ "entropy": 0.5704087316989899,
1574
+ "epoch": 0.6155052435558169,
1575
+ "grad_norm": 1.8203125,
1576
+ "learning_rate": 8.554006968641115e-06,
1577
+ "loss": 0.5746907234191895,
1578
+ "mean_token_accuracy": 0.826793709397316,
1579
+ "num_tokens": 20740547.0,
1580
+ "step": 1570
1581
+ },
1582
+ {
1583
+ "entropy": 0.5753558877855539,
1584
+ "epoch": 0.6194256591198667,
1585
+ "grad_norm": 1.7890625,
1586
+ "learning_rate": 8.466898954703833e-06,
1587
+ "loss": 0.5788108825683593,
1588
+ "mean_token_accuracy": 0.8253151409327983,
1589
+ "num_tokens": 20872233.0,
1590
+ "step": 1580
1591
+ },
1592
+ {
1593
+ "entropy": 0.58878487162292,
1594
+ "epoch": 0.6233460746839165,
1595
+ "grad_norm": 1.640625,
1596
+ "learning_rate": 8.379790940766552e-06,
1597
+ "loss": 0.5929996490478515,
1598
+ "mean_token_accuracy": 0.8213138461112977,
1599
+ "num_tokens": 21004889.0,
1600
+ "step": 1590
1601
+ },
1602
+ {
1603
+ "entropy": 0.5900854453444481,
1604
+ "epoch": 0.6272664902479663,
1605
+ "grad_norm": 1.8125,
1606
+ "learning_rate": 8.292682926829268e-06,
1607
+ "loss": 0.5887657642364502,
1608
+ "mean_token_accuracy": 0.8229175060987473,
1609
+ "num_tokens": 21136824.0,
1610
+ "step": 1600
1611
+ },
1612
+ {
1613
+ "entropy": 0.5752101615071297,
1614
+ "epoch": 0.6311869058120161,
1615
+ "grad_norm": 1.65625,
1616
+ "learning_rate": 8.205574912891987e-06,
1617
+ "loss": 0.5777340412139893,
1618
+ "mean_token_accuracy": 0.8268123485147953,
1619
+ "num_tokens": 21270819.0,
1620
+ "step": 1610
1621
+ },
1622
+ {
1623
+ "entropy": 0.5686212468892335,
1624
+ "epoch": 0.6351073213760658,
1625
+ "grad_norm": 1.625,
1626
+ "learning_rate": 8.118466898954704e-06,
1627
+ "loss": 0.5700377941131591,
1628
+ "mean_token_accuracy": 0.8275717034935951,
1629
+ "num_tokens": 21403507.0,
1630
+ "step": 1620
1631
+ },
1632
+ {
1633
+ "entropy": 0.588944623246789,
1634
+ "epoch": 0.6390277369401156,
1635
+ "grad_norm": 1.7109375,
1636
+ "learning_rate": 8.031358885017422e-06,
1637
+ "loss": 0.5895246028900146,
1638
+ "mean_token_accuracy": 0.8222635351121426,
1639
+ "num_tokens": 21539521.0,
1640
+ "step": 1630
1641
+ },
1642
+ {
1643
+ "entropy": 0.5585341576486826,
1644
+ "epoch": 0.6429481525041655,
1645
+ "grad_norm": 1.6875,
1646
+ "learning_rate": 7.94425087108014e-06,
1647
+ "loss": 0.5642005443572998,
1648
+ "mean_token_accuracy": 0.8300835333764554,
1649
+ "num_tokens": 21672827.0,
1650
+ "step": 1640
1651
+ },
1652
+ {
1653
+ "entropy": 0.601011986285448,
1654
+ "epoch": 0.6468685680682152,
1655
+ "grad_norm": 1.703125,
1656
+ "learning_rate": 7.857142857142858e-06,
1657
+ "loss": 0.6064223766326904,
1658
+ "mean_token_accuracy": 0.8166609443724155,
1659
+ "num_tokens": 21803538.0,
1660
+ "step": 1650
1661
+ },
1662
+ {
1663
+ "entropy": 0.5758274313062429,
1664
+ "epoch": 0.650788983632265,
1665
+ "grad_norm": 1.5625,
1666
+ "learning_rate": 7.770034843205574e-06,
1667
+ "loss": 0.5752018928527832,
1668
+ "mean_token_accuracy": 0.8261359445750713,
1669
+ "num_tokens": 21937475.0,
1670
+ "step": 1660
1671
+ },
1672
+ {
1673
+ "entropy": 0.5863972809165716,
1674
+ "epoch": 0.6547093991963148,
1675
+ "grad_norm": 1.9140625,
1676
+ "learning_rate": 7.682926829268293e-06,
1677
+ "loss": 0.5884019374847412,
1678
+ "mean_token_accuracy": 0.8222149938344956,
1679
+ "num_tokens": 22076290.0,
1680
+ "step": 1670
1681
+ },
1682
+ {
1683
+ "entropy": 0.5837476711720229,
1684
+ "epoch": 0.6586298147603646,
1685
+ "grad_norm": 1.625,
1686
+ "learning_rate": 7.595818815331011e-06,
1687
+ "loss": 0.5863306522369385,
1688
+ "mean_token_accuracy": 0.8223751485347748,
1689
+ "num_tokens": 22210848.0,
1690
+ "step": 1680
1691
+ },
1692
+ {
1693
+ "entropy": 0.5637796241790056,
1694
+ "epoch": 0.6625502303244144,
1695
+ "grad_norm": 1.6953125,
1696
+ "learning_rate": 7.508710801393729e-06,
1697
+ "loss": 0.5643290996551513,
1698
+ "mean_token_accuracy": 0.8271290838718415,
1699
+ "num_tokens": 22342981.0,
1700
+ "step": 1690
1701
+ },
1702
+ {
1703
+ "entropy": 0.6036053825169801,
1704
+ "epoch": 0.6664706458884642,
1705
+ "grad_norm": 1.7109375,
1706
+ "learning_rate": 7.421602787456447e-06,
1707
+ "loss": 0.6125275135040283,
1708
+ "mean_token_accuracy": 0.8163446709513664,
1709
+ "num_tokens": 22473724.0,
1710
+ "step": 1700
1711
+ },
1712
+ {
1713
+ "entropy": 0.5783885445445776,
1714
+ "epoch": 0.6703910614525139,
1715
+ "grad_norm": 1.609375,
1716
+ "learning_rate": 7.334494773519164e-06,
1717
+ "loss": 0.5844399452209472,
1718
+ "mean_token_accuracy": 0.8247374482452869,
1719
+ "num_tokens": 22603403.0,
1720
+ "step": 1710
1721
+ },
1722
+ {
1723
+ "entropy": 0.5773308865725995,
1724
+ "epoch": 0.6743114770165638,
1725
+ "grad_norm": 1.546875,
1726
+ "learning_rate": 7.247386759581882e-06,
1727
+ "loss": 0.5766704082489014,
1728
+ "mean_token_accuracy": 0.8254517287015914,
1729
+ "num_tokens": 22737181.0,
1730
+ "step": 1720
1731
+ },
1732
+ {
1733
+ "entropy": 0.5680316600948572,
1734
+ "epoch": 0.6782318925806136,
1735
+ "grad_norm": 1.75,
1736
+ "learning_rate": 7.1602787456446e-06,
1737
+ "loss": 0.5672269821166992,
1738
+ "mean_token_accuracy": 0.8279304966330528,
1739
+ "num_tokens": 22869837.0,
1740
+ "step": 1730
1741
+ },
1742
+ {
1743
+ "entropy": 0.5941996563225984,
1744
+ "epoch": 0.6821523081446633,
1745
+ "grad_norm": 1.7578125,
1746
+ "learning_rate": 7.0731707317073175e-06,
1747
+ "loss": 0.5911007404327393,
1748
+ "mean_token_accuracy": 0.8221167460083961,
1749
+ "num_tokens": 23003947.0,
1750
+ "step": 1740
1751
+ },
1752
+ {
1753
+ "entropy": 0.5591667950153351,
1754
+ "epoch": 0.6860727237087131,
1755
+ "grad_norm": 1.71875,
1756
+ "learning_rate": 6.986062717770036e-06,
1757
+ "loss": 0.5607988834381104,
1758
+ "mean_token_accuracy": 0.8297918803989888,
1759
+ "num_tokens": 23134580.0,
1760
+ "step": 1750
1761
+ },
1762
+ {
1763
+ "entropy": 0.5676139583811164,
1764
+ "epoch": 0.689993139272763,
1765
+ "grad_norm": 1.6875,
1766
+ "learning_rate": 6.898954703832753e-06,
1767
+ "loss": 0.5650452613830567,
1768
+ "mean_token_accuracy": 0.8275837257504464,
1769
+ "num_tokens": 23264614.0,
1770
+ "step": 1760
1771
+ },
1772
+ {
1773
+ "entropy": 0.5673956066370011,
1774
+ "epoch": 0.6939135548368127,
1775
+ "grad_norm": 1.7265625,
1776
+ "learning_rate": 6.8118466898954705e-06,
1777
+ "loss": 0.5670190334320069,
1778
+ "mean_token_accuracy": 0.8293615274131299,
1779
+ "num_tokens": 23396741.0,
1780
+ "step": 1770
1781
+ },
1782
+ {
1783
+ "entropy": 0.5742564305663109,
1784
+ "epoch": 0.6978339704008625,
1785
+ "grad_norm": 1.6875,
1786
+ "learning_rate": 6.724738675958189e-06,
1787
+ "loss": 0.5740202903747559,
1788
+ "mean_token_accuracy": 0.8251322150230408,
1789
+ "num_tokens": 23527918.0,
1790
+ "step": 1780
1791
+ },
1792
+ {
1793
+ "entropy": 0.5919697143137455,
1794
+ "epoch": 0.7017543859649122,
1795
+ "grad_norm": 1.8046875,
1796
+ "learning_rate": 6.637630662020907e-06,
1797
+ "loss": 0.5907740116119384,
1798
+ "mean_token_accuracy": 0.8220984056591988,
1799
+ "num_tokens": 23657724.0,
1800
+ "step": 1790
1801
+ },
1802
+ {
1803
+ "entropy": 0.5903078105300665,
1804
+ "epoch": 0.7056748015289621,
1805
+ "grad_norm": 1.7109375,
1806
+ "learning_rate": 6.5505226480836235e-06,
1807
+ "loss": 0.5937789440155029,
1808
+ "mean_token_accuracy": 0.8217935189604759,
1809
+ "num_tokens": 23786805.0,
1810
+ "step": 1800
1811
+ },
1812
+ {
1813
+ "entropy": 0.5733942896127701,
1814
+ "epoch": 0.7095952170930119,
1815
+ "grad_norm": 1.6171875,
1816
+ "learning_rate": 6.463414634146342e-06,
1817
+ "loss": 0.5705701828002929,
1818
+ "mean_token_accuracy": 0.8282980337738991,
1819
+ "num_tokens": 23923037.0,
1820
+ "step": 1810
1821
+ },
1822
+ {
1823
+ "entropy": 0.5554268516600132,
1824
+ "epoch": 0.7135156326570616,
1825
+ "grad_norm": 1.7890625,
1826
+ "learning_rate": 6.37630662020906e-06,
1827
+ "loss": 0.5606462955474854,
1828
+ "mean_token_accuracy": 0.8297950372099876,
1829
+ "num_tokens": 24053827.0,
1830
+ "step": 1820
1831
+ },
1832
+ {
1833
+ "entropy": 0.6039480961859226,
1834
+ "epoch": 0.7174360482211114,
1835
+ "grad_norm": 1.703125,
1836
+ "learning_rate": 6.289198606271778e-06,
1837
+ "loss": 0.6047250747680664,
1838
+ "mean_token_accuracy": 0.8179130852222443,
1839
+ "num_tokens": 24186062.0,
1840
+ "step": 1830
1841
+ },
1842
+ {
1843
+ "entropy": 0.570480278134346,
1844
+ "epoch": 0.7213564637851613,
1845
+ "grad_norm": 1.6796875,
1846
+ "learning_rate": 6.202090592334495e-06,
1847
+ "loss": 0.5673944473266601,
1848
+ "mean_token_accuracy": 0.8284930318593979,
1849
+ "num_tokens": 24315993.0,
1850
+ "step": 1840
1851
+ },
1852
+ {
1853
+ "entropy": 0.5931850384920836,
1854
+ "epoch": 0.725276879349211,
1855
+ "grad_norm": 1.78125,
1856
+ "learning_rate": 6.114982578397213e-06,
1857
+ "loss": 0.5964553833007813,
1858
+ "mean_token_accuracy": 0.8212155938148499,
1859
+ "num_tokens": 24446858.0,
1860
+ "step": 1850
1861
+ },
1862
+ {
1863
+ "entropy": 0.6047229062765836,
1864
+ "epoch": 0.7291972949132608,
1865
+ "grad_norm": 1.828125,
1866
+ "learning_rate": 6.027874564459931e-06,
1867
+ "loss": 0.6088790416717529,
1868
+ "mean_token_accuracy": 0.817912295460701,
1869
+ "num_tokens": 24580558.0,
1870
+ "step": 1860
1871
+ },
1872
+ {
1873
+ "entropy": 0.5825779817998409,
1874
+ "epoch": 0.7331177104773106,
1875
+ "grad_norm": 1.859375,
1876
+ "learning_rate": 5.940766550522649e-06,
1877
+ "loss": 0.5805138111114502,
1878
+ "mean_token_accuracy": 0.8254112429916859,
1879
+ "num_tokens": 24718156.0,
1880
+ "step": 1870
1881
+ },
1882
+ {
1883
+ "entropy": 0.5747309617698193,
1884
+ "epoch": 0.7370381260413604,
1885
+ "grad_norm": 1.6953125,
1886
+ "learning_rate": 5.853658536585366e-06,
1887
+ "loss": 0.5785604953765869,
1888
+ "mean_token_accuracy": 0.8248820044100285,
1889
+ "num_tokens": 24853874.0,
1890
+ "step": 1880
1891
+ },
1892
+ {
1893
+ "entropy": 0.601536936685443,
1894
+ "epoch": 0.7409585416054102,
1895
+ "grad_norm": 1.8671875,
1896
+ "learning_rate": 5.766550522648084e-06,
1897
+ "loss": 0.6086882591247559,
1898
+ "mean_token_accuracy": 0.8180929891765117,
1899
+ "num_tokens": 24983767.0,
1900
+ "step": 1890
1901
+ },
1902
+ {
1903
+ "entropy": 0.5778564881533385,
1904
+ "epoch": 0.74487895716946,
1905
+ "grad_norm": 1.6484375,
1906
+ "learning_rate": 5.679442508710802e-06,
1907
+ "loss": 0.5777944087982178,
1908
+ "mean_token_accuracy": 0.8248727723956109,
1909
+ "num_tokens": 25112481.0,
1910
+ "step": 1900
1911
+ },
1912
+ {
1913
+ "entropy": 0.5561993703246116,
1914
+ "epoch": 0.7487993727335097,
1915
+ "grad_norm": 1.5859375,
1916
+ "learning_rate": 5.59233449477352e-06,
1917
+ "loss": 0.5545031547546386,
1918
+ "mean_token_accuracy": 0.8313787803053856,
1919
+ "num_tokens": 25244621.0,
1920
+ "step": 1910
1921
+ },
1922
+ {
1923
+ "entropy": 0.5606953276321291,
1924
+ "epoch": 0.7527197882975596,
1925
+ "grad_norm": 1.640625,
1926
+ "learning_rate": 5.505226480836237e-06,
1927
+ "loss": 0.5607188224792481,
1928
+ "mean_token_accuracy": 0.8297309316694736,
1929
+ "num_tokens": 25376819.0,
1930
+ "step": 1920
1931
+ },
1932
+ {
1933
+ "entropy": 0.5783195950090885,
1934
+ "epoch": 0.7566402038616094,
1935
+ "grad_norm": 1.78125,
1936
+ "learning_rate": 5.418118466898955e-06,
1937
+ "loss": 0.57740797996521,
1938
+ "mean_token_accuracy": 0.8247730396687984,
1939
+ "num_tokens": 25509601.0,
1940
+ "step": 1930
1941
+ },
1942
+ {
1943
+ "entropy": 0.5782700140029192,
1944
+ "epoch": 0.7605606194256591,
1945
+ "grad_norm": 1.6796875,
1946
+ "learning_rate": 5.331010452961673e-06,
1947
+ "loss": 0.5782852649688721,
1948
+ "mean_token_accuracy": 0.824606055766344,
1949
+ "num_tokens": 25643050.0,
1950
+ "step": 1940
1951
+ },
1952
+ {
1953
+ "entropy": 0.561905774474144,
1954
+ "epoch": 0.7644810349897089,
1955
+ "grad_norm": 1.6171875,
1956
+ "learning_rate": 5.243902439024391e-06,
1957
+ "loss": 0.5643265724182129,
1958
+ "mean_token_accuracy": 0.8287928692996502,
1959
+ "num_tokens": 25769796.0,
1960
+ "step": 1950
1961
+ },
1962
+ {
1963
+ "entropy": 0.5690379086881876,
1964
+ "epoch": 0.7684014505537587,
1965
+ "grad_norm": 1.6796875,
1966
+ "learning_rate": 5.156794425087108e-06,
1967
+ "loss": 0.5731242179870606,
1968
+ "mean_token_accuracy": 0.8275946989655495,
1969
+ "num_tokens": 25904632.0,
1970
+ "step": 1960
1971
+ },
1972
+ {
1973
+ "entropy": 0.5504785589873791,
1974
+ "epoch": 0.7723218661178085,
1975
+ "grad_norm": 1.53125,
1976
+ "learning_rate": 5.0696864111498264e-06,
1977
+ "loss": 0.5516434192657471,
1978
+ "mean_token_accuracy": 0.8325082875788212,
1979
+ "num_tokens": 26035719.0,
1980
+ "step": 1970
1981
+ },
1982
+ {
1983
+ "entropy": 0.5725247742608189,
1984
+ "epoch": 0.7762422816818583,
1985
+ "grad_norm": 1.734375,
1986
+ "learning_rate": 4.982578397212544e-06,
1987
+ "loss": 0.5785993576049805,
1988
+ "mean_token_accuracy": 0.8263410687446594,
1989
+ "num_tokens": 26167472.0,
1990
+ "step": 1980
1991
+ },
1992
+ {
1993
+ "entropy": 0.5720619566738605,
1994
+ "epoch": 0.780162697245908,
1995
+ "grad_norm": 1.5390625,
1996
+ "learning_rate": 4.895470383275262e-06,
1997
+ "loss": 0.5713248252868652,
1998
+ "mean_token_accuracy": 0.8277824930846691,
1999
+ "num_tokens": 26301163.0,
2000
+ "step": 1990
2001
+ },
2002
+ {
2003
+ "entropy": 0.5604069098830223,
2004
+ "epoch": 0.7840831128099579,
2005
+ "grad_norm": 1.6875,
2006
+ "learning_rate": 4.8083623693379794e-06,
2007
+ "loss": 0.5647254467010498,
2008
+ "mean_token_accuracy": 0.8302777230739593,
2009
+ "num_tokens": 26430868.0,
2010
+ "step": 2000
2011
+ },
2012
+ {
2013
+ "entropy": 0.5837776899337769,
2014
+ "epoch": 0.7880035283740077,
2015
+ "grad_norm": 1.7734375,
2016
+ "learning_rate": 4.721254355400697e-06,
2017
+ "loss": 0.5858076095581055,
2018
+ "mean_token_accuracy": 0.8225198201835155,
2019
+ "num_tokens": 26560775.0,
2020
+ "step": 2010
2021
+ },
2022
+ {
2023
+ "entropy": 0.5733296349644661,
2024
+ "epoch": 0.7919239439380574,
2025
+ "grad_norm": 1.625,
2026
+ "learning_rate": 4.634146341463416e-06,
2027
+ "loss": 0.5758630275726319,
2028
+ "mean_token_accuracy": 0.8253522992134095,
2029
+ "num_tokens": 26693234.0,
2030
+ "step": 2020
2031
+ },
2032
+ {
2033
+ "entropy": 0.557684974372387,
2034
+ "epoch": 0.7958443595021072,
2035
+ "grad_norm": 1.7109375,
2036
+ "learning_rate": 4.5470383275261325e-06,
2037
+ "loss": 0.5551248073577881,
2038
+ "mean_token_accuracy": 0.8318465322256088,
2039
+ "num_tokens": 26828863.0,
2040
+ "step": 2030
2041
+ },
2042
+ {
2043
+ "entropy": 0.5628693040460349,
2044
+ "epoch": 0.7997647750661571,
2045
+ "grad_norm": 1.75,
2046
+ "learning_rate": 4.45993031358885e-06,
2047
+ "loss": 0.571585750579834,
2048
+ "mean_token_accuracy": 0.8270311810076236,
2049
+ "num_tokens": 26960662.0,
2050
+ "step": 2040
2051
+ },
2052
+ {
2053
+ "entropy": 0.5897116485983134,
2054
+ "epoch": 0.8036851906302068,
2055
+ "grad_norm": 1.7421875,
2056
+ "learning_rate": 4.372822299651569e-06,
2057
+ "loss": 0.5966588497161865,
2058
+ "mean_token_accuracy": 0.8230571210384369,
2059
+ "num_tokens": 27091359.0,
2060
+ "step": 2050
2061
+ },
2062
+ {
2063
+ "entropy": 0.5869602940976619,
2064
+ "epoch": 0.8076056061942566,
2065
+ "grad_norm": 1.7109375,
2066
+ "learning_rate": 4.2857142857142855e-06,
2067
+ "loss": 0.588222885131836,
2068
+ "mean_token_accuracy": 0.8202385693788529,
2069
+ "num_tokens": 27225435.0,
2070
+ "step": 2060
2071
+ },
2072
+ {
2073
+ "entropy": 0.6028570268303156,
2074
+ "epoch": 0.8115260217583063,
2075
+ "grad_norm": 1.6328125,
2076
+ "learning_rate": 4.198606271777004e-06,
2077
+ "loss": 0.606553316116333,
2078
+ "mean_token_accuracy": 0.8188532285392285,
2079
+ "num_tokens": 27358034.0,
2080
+ "step": 2070
2081
+ },
2082
+ {
2083
+ "entropy": 0.5583208829164505,
2084
+ "epoch": 0.8154464373223562,
2085
+ "grad_norm": 1.78125,
2086
+ "learning_rate": 4.111498257839722e-06,
2087
+ "loss": 0.5582189083099365,
2088
+ "mean_token_accuracy": 0.8306586474180222,
2089
+ "num_tokens": 27490146.0,
2090
+ "step": 2080
2091
+ },
2092
+ {
2093
+ "entropy": 0.5879612069576978,
2094
+ "epoch": 0.819366852886406,
2095
+ "grad_norm": 1.8359375,
2096
+ "learning_rate": 4.024390243902439e-06,
2097
+ "loss": 0.5868856906890869,
2098
+ "mean_token_accuracy": 0.821238712221384,
2099
+ "num_tokens": 27621710.0,
2100
+ "step": 2090
2101
+ },
2102
+ {
2103
+ "entropy": 0.5645837895572186,
2104
+ "epoch": 0.8232872684504557,
2105
+ "grad_norm": 1.6640625,
2106
+ "learning_rate": 3.937282229965157e-06,
2107
+ "loss": 0.5639263153076172,
2108
+ "mean_token_accuracy": 0.8281869366765022,
2109
+ "num_tokens": 27758436.0,
2110
+ "step": 2100
2111
+ },
2112
+ {
2113
+ "entropy": 0.5650495100766421,
2114
+ "epoch": 0.8272076840145055,
2115
+ "grad_norm": 1.71875,
2116
+ "learning_rate": 3.850174216027875e-06,
2117
+ "loss": 0.5659061908721924,
2118
+ "mean_token_accuracy": 0.8303346544504165,
2119
+ "num_tokens": 27886659.0,
2120
+ "step": 2110
2121
+ },
2122
+ {
2123
+ "entropy": 0.5669478211551905,
2124
+ "epoch": 0.8311280995785554,
2125
+ "grad_norm": 1.875,
2126
+ "learning_rate": 3.7630662020905927e-06,
2127
+ "loss": 0.5676323890686035,
2128
+ "mean_token_accuracy": 0.8264915093779563,
2129
+ "num_tokens": 28020869.0,
2130
+ "step": 2120
2131
+ },
2132
+ {
2133
+ "entropy": 0.5540701054036618,
2134
+ "epoch": 0.8350485151426051,
2135
+ "grad_norm": 1.71875,
2136
+ "learning_rate": 3.67595818815331e-06,
2137
+ "loss": 0.5579257011413574,
2138
+ "mean_token_accuracy": 0.8317541219294071,
2139
+ "num_tokens": 28152491.0,
2140
+ "step": 2130
2141
+ },
2142
+ {
2143
+ "entropy": 0.5800126396119595,
2144
+ "epoch": 0.8389689307066549,
2145
+ "grad_norm": 1.8125,
2146
+ "learning_rate": 3.588850174216028e-06,
2147
+ "loss": 0.5853462219238281,
2148
+ "mean_token_accuracy": 0.8237727522850037,
2149
+ "num_tokens": 28284698.0,
2150
+ "step": 2140
2151
+ },
2152
+ {
2153
+ "entropy": 0.6118997160345316,
2154
+ "epoch": 0.8428893462707047,
2155
+ "grad_norm": 1.6796875,
2156
+ "learning_rate": 3.501742160278746e-06,
2157
+ "loss": 0.6129735469818115,
2158
+ "mean_token_accuracy": 0.8170387588441372,
2159
+ "num_tokens": 28413374.0,
2160
+ "step": 2150
2161
+ },
2162
+ {
2163
+ "entropy": 0.5741523541510105,
2164
+ "epoch": 0.8468097618347545,
2165
+ "grad_norm": 1.703125,
2166
+ "learning_rate": 3.414634146341464e-06,
2167
+ "loss": 0.5773540019989014,
2168
+ "mean_token_accuracy": 0.8257443487644196,
2169
+ "num_tokens": 28545962.0,
2170
+ "step": 2160
2171
+ },
2172
+ {
2173
+ "entropy": 0.5787751715630293,
2174
+ "epoch": 0.8507301773988043,
2175
+ "grad_norm": 1.6796875,
2176
+ "learning_rate": 3.3275261324041815e-06,
2177
+ "loss": 0.5790078163146972,
2178
+ "mean_token_accuracy": 0.825350683927536,
2179
+ "num_tokens": 28680445.0,
2180
+ "step": 2170
2181
+ },
2182
+ {
2183
+ "entropy": 0.5716471575200558,
2184
+ "epoch": 0.8546505929628541,
2185
+ "grad_norm": 1.6953125,
2186
+ "learning_rate": 3.240418118466899e-06,
2187
+ "loss": 0.5730648994445801,
2188
+ "mean_token_accuracy": 0.8268438093364239,
2189
+ "num_tokens": 28811694.0,
2190
+ "step": 2180
2191
+ },
2192
+ {
2193
+ "entropy": 0.5720145717263222,
2194
+ "epoch": 0.8585710085269038,
2195
+ "grad_norm": 1.84375,
2196
+ "learning_rate": 3.1533101045296173e-06,
2197
+ "loss": 0.5759499549865723,
2198
+ "mean_token_accuracy": 0.8254265673458576,
2199
+ "num_tokens": 28945890.0,
2200
+ "step": 2190
2201
+ },
2202
+ {
2203
+ "entropy": 0.5667183842509985,
2204
+ "epoch": 0.8624914240909537,
2205
+ "grad_norm": 1.796875,
2206
+ "learning_rate": 3.0662020905923345e-06,
2207
+ "loss": 0.567453670501709,
2208
+ "mean_token_accuracy": 0.8277363143861294,
2209
+ "num_tokens": 29075294.0,
2210
+ "step": 2200
2211
+ },
2212
+ {
2213
+ "entropy": 0.583325557038188,
2214
+ "epoch": 0.8664118396550035,
2215
+ "grad_norm": 1.7109375,
2216
+ "learning_rate": 2.9790940766550526e-06,
2217
+ "loss": 0.5870438098907471,
2218
+ "mean_token_accuracy": 0.8234334781765937,
2219
+ "num_tokens": 29207413.0,
2220
+ "step": 2210
2221
+ },
2222
+ {
2223
+ "entropy": 0.5614024806767702,
2224
+ "epoch": 0.8703322552190532,
2225
+ "grad_norm": 1.65625,
2226
+ "learning_rate": 2.8919860627177703e-06,
2227
+ "loss": 0.5655649185180665,
2228
+ "mean_token_accuracy": 0.8296878322958946,
2229
+ "num_tokens": 29340755.0,
2230
+ "step": 2220
2231
+ },
2232
+ {
2233
+ "entropy": 0.5752765364944935,
2234
+ "epoch": 0.874252670783103,
2235
+ "grad_norm": 1.65625,
2236
+ "learning_rate": 2.8048780487804884e-06,
2237
+ "loss": 0.5743560314178466,
2238
+ "mean_token_accuracy": 0.8251941919326782,
2239
+ "num_tokens": 29474374.0,
2240
+ "step": 2230
2241
+ },
2242
+ {
2243
+ "entropy": 0.5772768836468458,
2244
+ "epoch": 0.8781730863471529,
2245
+ "grad_norm": 1.5234375,
2246
+ "learning_rate": 2.7177700348432056e-06,
2247
+ "loss": 0.5725958347320557,
2248
+ "mean_token_accuracy": 0.826750586181879,
2249
+ "num_tokens": 29607009.0,
2250
+ "step": 2240
2251
+ },
2252
+ {
2253
+ "entropy": 0.5837304752320052,
2254
+ "epoch": 0.8820935019112026,
2255
+ "grad_norm": 1.875,
2256
+ "learning_rate": 2.6306620209059237e-06,
2257
+ "loss": 0.588666296005249,
2258
+ "mean_token_accuracy": 0.8247058235108853,
2259
+ "num_tokens": 29738318.0,
2260
+ "step": 2250
2261
+ },
2262
+ {
2263
+ "entropy": 0.5540932007133961,
2264
+ "epoch": 0.8860139174752524,
2265
+ "grad_norm": 1.8359375,
2266
+ "learning_rate": 2.5435540069686414e-06,
2267
+ "loss": 0.5564557552337647,
2268
+ "mean_token_accuracy": 0.8303055338561535,
2269
+ "num_tokens": 29871530.0,
2270
+ "step": 2260
2271
+ },
2272
+ {
2273
+ "entropy": 0.5647958315908909,
2274
+ "epoch": 0.8899343330393021,
2275
+ "grad_norm": 1.90625,
2276
+ "learning_rate": 2.456445993031359e-06,
2277
+ "loss": 0.5653763771057129,
2278
+ "mean_token_accuracy": 0.8286120660603047,
2279
+ "num_tokens": 30004318.0,
2280
+ "step": 2270
2281
+ },
2282
+ {
2283
+ "entropy": 0.5644253201782703,
2284
+ "epoch": 0.8938547486033519,
2285
+ "grad_norm": 1.734375,
2286
+ "learning_rate": 2.3693379790940767e-06,
2287
+ "loss": 0.5634883880615235,
2288
+ "mean_token_accuracy": 0.8275229759514332,
2289
+ "num_tokens": 30136805.0,
2290
+ "step": 2280
2291
+ },
2292
+ {
2293
+ "entropy": 0.5744159776717425,
2294
+ "epoch": 0.8977751641674018,
2295
+ "grad_norm": 1.7421875,
2296
+ "learning_rate": 2.282229965156795e-06,
2297
+ "loss": 0.5716060161590576,
2298
+ "mean_token_accuracy": 0.82558439001441,
2299
+ "num_tokens": 30267921.0,
2300
+ "step": 2290
2301
+ },
2302
+ {
2303
+ "entropy": 0.5650353617966175,
2304
+ "epoch": 0.9016955797314515,
2305
+ "grad_norm": 1.6953125,
2306
+ "learning_rate": 2.1951219512195125e-06,
2307
+ "loss": 0.5693279266357422,
2308
+ "mean_token_accuracy": 0.8278473079204559,
2309
+ "num_tokens": 30402696.0,
2310
+ "step": 2300
2311
+ },
2312
+ {
2313
+ "entropy": 0.5965178959071636,
2314
+ "epoch": 0.9056159952955013,
2315
+ "grad_norm": 1.71875,
2316
+ "learning_rate": 2.10801393728223e-06,
2317
+ "loss": 0.6028908729553223,
2318
+ "mean_token_accuracy": 0.8189698673784733,
2319
+ "num_tokens": 30536391.0,
2320
+ "step": 2310
2321
+ },
2322
+ {
2323
+ "entropy": 0.601612939313054,
2324
+ "epoch": 0.9095364108595511,
2325
+ "grad_norm": 1.796875,
2326
+ "learning_rate": 2.020905923344948e-06,
2327
+ "loss": 0.6042677402496338,
2328
+ "mean_token_accuracy": 0.8197429470717907,
2329
+ "num_tokens": 30669243.0,
2330
+ "step": 2320
2331
+ },
2332
+ {
2333
+ "entropy": 0.5706041093915701,
2334
+ "epoch": 0.9134568264236009,
2335
+ "grad_norm": 1.8984375,
2336
+ "learning_rate": 1.9337979094076655e-06,
2337
+ "loss": 0.5693963527679443,
2338
+ "mean_token_accuracy": 0.8271980971097946,
2339
+ "num_tokens": 30800574.0,
2340
+ "step": 2330
2341
+ },
2342
+ {
2343
+ "entropy": 0.5640899043530225,
2344
+ "epoch": 0.9173772419876507,
2345
+ "grad_norm": 1.6484375,
2346
+ "learning_rate": 1.8466898954703836e-06,
2347
+ "loss": 0.5705442905426026,
2348
+ "mean_token_accuracy": 0.8264196418225765,
2349
+ "num_tokens": 30933674.0,
2350
+ "step": 2340
2351
+ },
2352
+ {
2353
+ "entropy": 0.5657230116426945,
2354
+ "epoch": 0.9212976575517005,
2355
+ "grad_norm": 1.609375,
2356
+ "learning_rate": 1.7595818815331012e-06,
2357
+ "loss": 0.570911979675293,
2358
+ "mean_token_accuracy": 0.8267294079065323,
2359
+ "num_tokens": 31065102.0,
2360
+ "step": 2350
2361
+ },
2362
+ {
2363
+ "entropy": 0.562781137228012,
2364
+ "epoch": 0.9252180731157502,
2365
+ "grad_norm": 1.796875,
2366
+ "learning_rate": 1.6724738675958191e-06,
2367
+ "loss": 0.5703943729400635,
2368
+ "mean_token_accuracy": 0.8268945284187794,
2369
+ "num_tokens": 31195291.0,
2370
+ "step": 2360
2371
+ },
2372
+ {
2373
+ "entropy": 0.5673436559736729,
2374
+ "epoch": 0.9291384886798001,
2375
+ "grad_norm": 1.609375,
2376
+ "learning_rate": 1.5853658536585368e-06,
2377
+ "loss": 0.5722959995269775,
2378
+ "mean_token_accuracy": 0.8286142982542515,
2379
+ "num_tokens": 31327802.0,
2380
+ "step": 2370
2381
+ },
2382
+ {
2383
+ "entropy": 0.5538178488612175,
2384
+ "epoch": 0.9330589042438499,
2385
+ "grad_norm": 1.859375,
2386
+ "learning_rate": 1.4982578397212545e-06,
2387
+ "loss": 0.5541855812072753,
2388
+ "mean_token_accuracy": 0.8326897613704205,
2389
+ "num_tokens": 31454999.0,
2390
+ "step": 2380
2391
+ },
2392
+ {
2393
+ "entropy": 0.5489677041769028,
2394
+ "epoch": 0.9369793198078996,
2395
+ "grad_norm": 1.5625,
2396
+ "learning_rate": 1.4111498257839723e-06,
2397
+ "loss": 0.5547789573669434,
2398
+ "mean_token_accuracy": 0.8312770992517471,
2399
+ "num_tokens": 31592105.0,
2400
+ "step": 2390
2401
+ },
2402
+ {
2403
+ "entropy": 0.5815275024622679,
2404
+ "epoch": 0.9408997353719494,
2405
+ "grad_norm": 1.8671875,
2406
+ "learning_rate": 1.32404181184669e-06,
2407
+ "loss": 0.589658784866333,
2408
+ "mean_token_accuracy": 0.8223450765013695,
2409
+ "num_tokens": 31720785.0,
2410
+ "step": 2400
2411
+ },
2412
+ {
2413
+ "entropy": 0.5828426022082567,
2414
+ "epoch": 0.9448201509359992,
2415
+ "grad_norm": 1.71875,
2416
+ "learning_rate": 1.2369337979094077e-06,
2417
+ "loss": 0.5806317806243897,
2418
+ "mean_token_accuracy": 0.8260537378489972,
2419
+ "num_tokens": 31852738.0,
2420
+ "step": 2410
2421
+ },
2422
+ {
2423
+ "entropy": 0.5780322037637233,
2424
+ "epoch": 0.948740566500049,
2425
+ "grad_norm": 1.671875,
2426
+ "learning_rate": 1.1498257839721255e-06,
2427
+ "loss": 0.573900556564331,
2428
+ "mean_token_accuracy": 0.8251914605498314,
2429
+ "num_tokens": 31985785.0,
2430
+ "step": 2420
2431
+ },
2432
+ {
2433
+ "entropy": 0.5561736188828945,
2434
+ "epoch": 0.9526609820640988,
2435
+ "grad_norm": 1.734375,
2436
+ "learning_rate": 1.0627177700348432e-06,
2437
+ "loss": 0.5585554599761963,
2438
+ "mean_token_accuracy": 0.8310537829995155,
2439
+ "num_tokens": 32116876.0,
2440
+ "step": 2430
2441
+ },
2442
+ {
2443
+ "entropy": 0.5960801500827074,
2444
+ "epoch": 0.9565813976281485,
2445
+ "grad_norm": 1.8515625,
2446
+ "learning_rate": 9.75609756097561e-07,
2447
+ "loss": 0.6023911476135254,
2448
+ "mean_token_accuracy": 0.8196251414716244,
2449
+ "num_tokens": 32251738.0,
2450
+ "step": 2440
2451
+ },
2452
+ {
2453
+ "entropy": 0.5717530060559511,
2454
+ "epoch": 0.9605018131921984,
2455
+ "grad_norm": 1.4609375,
2456
+ "learning_rate": 8.885017421602789e-07,
2457
+ "loss": 0.5710067749023438,
2458
+ "mean_token_accuracy": 0.8258462838828564,
2459
+ "num_tokens": 32386153.0,
2460
+ "step": 2450
2461
+ },
2462
+ {
2463
+ "entropy": 0.5853737752884627,
2464
+ "epoch": 0.9644222287562482,
2465
+ "grad_norm": 1.875,
2466
+ "learning_rate": 8.013937282229965e-07,
2467
+ "loss": 0.59298734664917,
2468
+ "mean_token_accuracy": 0.8215095445513725,
2469
+ "num_tokens": 32521937.0,
2470
+ "step": 2460
2471
+ },
2472
+ {
2473
+ "entropy": 0.5701922554522753,
2474
+ "epoch": 0.9683426443202979,
2475
+ "grad_norm": 1.7578125,
2476
+ "learning_rate": 7.142857142857143e-07,
2477
+ "loss": 0.5723202228546143,
2478
+ "mean_token_accuracy": 0.8279052451252937,
2479
+ "num_tokens": 32653667.0,
2480
+ "step": 2470
2481
+ },
2482
+ {
2483
+ "entropy": 0.5777422614395619,
2484
+ "epoch": 0.9722630598843477,
2485
+ "grad_norm": 1.6875,
2486
+ "learning_rate": 6.271777003484321e-07,
2487
+ "loss": 0.581749153137207,
2488
+ "mean_token_accuracy": 0.8253626257181168,
2489
+ "num_tokens": 32787099.0,
2490
+ "step": 2480
2491
+ },
2492
+ {
2493
+ "entropy": 0.5758198279887438,
2494
+ "epoch": 0.9761834754483976,
2495
+ "grad_norm": 1.6015625,
2496
+ "learning_rate": 5.400696864111499e-07,
2497
+ "loss": 0.5754932403564453,
2498
+ "mean_token_accuracy": 0.8263446964323521,
2499
+ "num_tokens": 32919330.0,
2500
+ "step": 2490
2501
+ },
2502
+ {
2503
+ "entropy": 0.5611678268760443,
2504
+ "epoch": 0.9801038910124473,
2505
+ "grad_norm": 1.703125,
2506
+ "learning_rate": 4.5296167247386764e-07,
2507
+ "loss": 0.5594530582427979,
2508
+ "mean_token_accuracy": 0.8305567562580108,
2509
+ "num_tokens": 33052864.0,
2510
+ "step": 2500
2511
+ },
2512
+ {
2513
+ "entropy": 0.5629599597305059,
2514
+ "epoch": 0.9840243065764971,
2515
+ "grad_norm": 1.7578125,
2516
+ "learning_rate": 3.6585365853658536e-07,
2517
+ "loss": 0.5671555042266846,
2518
+ "mean_token_accuracy": 0.8285158857703209,
2519
+ "num_tokens": 33187097.0,
2520
+ "step": 2510
2521
+ },
2522
+ {
2523
+ "entropy": 0.5846845716238022,
2524
+ "epoch": 0.9879447221405468,
2525
+ "grad_norm": 1.640625,
2526
+ "learning_rate": 2.7874564459930313e-07,
2527
+ "loss": 0.5863873481750488,
2528
+ "mean_token_accuracy": 0.8224268153309822,
2529
+ "num_tokens": 33324336.0,
2530
+ "step": 2520
2531
+ },
2532
+ {
2533
+ "entropy": 0.5745269916951656,
2534
+ "epoch": 0.9918651377045967,
2535
+ "grad_norm": 1.8046875,
2536
+ "learning_rate": 1.916376306620209e-07,
2537
+ "loss": 0.5790841579437256,
2538
+ "mean_token_accuracy": 0.8244781382381916,
2539
+ "num_tokens": 33457098.0,
2540
+ "step": 2530
2541
+ },
2542
+ {
2543
+ "entropy": 0.5654200822114944,
2544
+ "epoch": 0.9957855532686465,
2545
+ "grad_norm": 1.6640625,
2546
+ "learning_rate": 1.045296167247387e-07,
2547
+ "loss": 0.5639691829681397,
2548
+ "mean_token_accuracy": 0.8280748799443245,
2549
+ "num_tokens": 33590704.0,
2550
+ "step": 2540
2551
+ },
2552
+ {
2553
+ "entropy": 0.5788687650114298,
2554
+ "epoch": 0.9997059688326962,
2555
+ "grad_norm": 1.7265625,
2556
+ "learning_rate": 1.7421602787456446e-08,
2557
+ "loss": 0.5817639350891113,
2558
+ "mean_token_accuracy": 0.8252955496311187,
2559
+ "num_tokens": 33724217.0,
2560
+ "step": 2550
2561
+ }
2562
+ ],
2563
+ "logging_steps": 10,
2564
+ "max_steps": 2551,
2565
+ "num_input_tokens_seen": 0,
2566
+ "num_train_epochs": 1,
2567
+ "save_steps": 500,
2568
+ "stateful_callbacks": {
2569
+ "TrainerControl": {
2570
+ "args": {
2571
+ "should_epoch_stop": false,
2572
+ "should_evaluate": false,
2573
+ "should_log": false,
2574
+ "should_save": true,
2575
+ "should_training_stop": true
2576
+ },
2577
+ "attributes": {}
2578
+ }
2579
+ },
2580
+ "total_flos": 3.2075681983043174e+17,
2581
+ "train_batch_size": 2,
2582
+ "trial_name": null,
2583
+ "trial_params": null
2584
+ }
checkpoint-2551/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf5271d53e5ecf5b0e849ec9b9f8933cba573d9ea8200a6a32b9985bb49d32a6
3
+ size 5713
config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 6144,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 151643,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": true,
59
+ "transformers_version": "5.7.0",
60
+ "use_cache": false,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 151936
63
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.51.0"
13
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77a976b0308720d09d1989a48aee5169ffdcf716f71404832899518b7211c1d
3
+ size 3441185608
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "local_files_only": false,
25
+ "model_max_length": 131072,
26
+ "pad_token": "<|endoftext|>",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null,
30
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = 
content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}"
31
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf5271d53e5ecf5b0e849ec9b9f8933cba573d9ea8200a6a32b9985bb49d32a6
3
+ size 5713