camilablank committed on
Commit
1bc2c2e
·
verified ·
1 Parent(s): 8a3b309

Upload pirate_L16_a150 seed_42 (final adapter + all intermediate checkpoints)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. README.md +62 -0
  3. adapter_config.json +48 -0
  4. adapter_model.safetensors +3 -0
  5. chat_template.jinja +54 -0
  6. checkpoint-1086/README.md +209 -0
  7. checkpoint-1086/adapter_config.json +48 -0
  8. checkpoint-1086/adapter_model.safetensors +3 -0
  9. checkpoint-1086/chat_template.jinja +54 -0
  10. checkpoint-1086/tokenizer.json +3 -0
  11. checkpoint-1086/tokenizer_config.json +29 -0
  12. checkpoint-1086/trainer_state.json +1136 -0
  13. checkpoint-1086/training_args.bin +3 -0
  14. checkpoint-1629/README.md +209 -0
  15. checkpoint-1629/adapter_config.json +48 -0
  16. checkpoint-1629/adapter_model.safetensors +3 -0
  17. checkpoint-1629/chat_template.jinja +54 -0
  18. checkpoint-1629/tokenizer.json +3 -0
  19. checkpoint-1629/tokenizer_config.json +29 -0
  20. checkpoint-1629/trainer_state.json +1687 -0
  21. checkpoint-1629/training_args.bin +3 -0
  22. checkpoint-2172/README.md +209 -0
  23. checkpoint-2172/adapter_config.json +48 -0
  24. checkpoint-2172/adapter_model.safetensors +3 -0
  25. checkpoint-2172/chat_template.jinja +54 -0
  26. checkpoint-2172/tokenizer.json +3 -0
  27. checkpoint-2172/tokenizer_config.json +29 -0
  28. checkpoint-2172/trainer_state.json +2248 -0
  29. checkpoint-2172/training_args.bin +3 -0
  30. checkpoint-2715/README.md +209 -0
  31. checkpoint-2715/adapter_config.json +48 -0
  32. checkpoint-2715/adapter_model.safetensors +3 -0
  33. checkpoint-2715/chat_template.jinja +54 -0
  34. checkpoint-2715/tokenizer.json +3 -0
  35. checkpoint-2715/tokenizer_config.json +29 -0
  36. checkpoint-2715/trainer_state.json +2799 -0
  37. checkpoint-2715/training_args.bin +3 -0
  38. checkpoint-3258/README.md +209 -0
  39. checkpoint-3258/adapter_config.json +48 -0
  40. checkpoint-3258/adapter_model.safetensors +3 -0
  41. checkpoint-3258/chat_template.jinja +54 -0
  42. checkpoint-3258/tokenizer.json +3 -0
  43. checkpoint-3258/tokenizer_config.json +29 -0
  44. checkpoint-3258/trainer_state.json +0 -0
  45. checkpoint-3258/training_args.bin +3 -0
  46. checkpoint-3801/README.md +209 -0
  47. checkpoint-3801/adapter_config.json +48 -0
  48. checkpoint-3801/adapter_model.safetensors +3 -0
  49. checkpoint-3801/chat_template.jinja +54 -0
  50. checkpoint-3801/tokenizer.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-1086/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-1629/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-2172/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-2715/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-3258/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-3801/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-4344/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-4887/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-543/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-5430/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ model_name: seed_42
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ licence: license
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for seed_42
16
+
17
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="<hub-username>/<this-repo-id>", device="cuda")  # replace the placeholder with this repository's Hub id
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/camilab-stanford-university/subliminal_learning/runs/9746rvh5)
34
+
35
+
36
+
37
+ This model was trained with SFT.
38
+
39
+ ### Framework versions
40
+
41
+ - PEFT: 0.19.1
42
+ - TRL: 1.2.0
43
+ - Transformers: 5.5.4
44
+ - Pytorch: 2.10.0
45
+ - Datasets: 4.8.4
46
+ - Tokenizers: 0.22.2
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @software{vonwerra2020trl,
56
+ title = {{TRL: Transformers Reinforcement Learning}},
57
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
58
+ license = {Apache-2.0},
59
+ url = {https://github.com/huggingface/trl},
60
+ year = {2020}
61
+ }
62
+ ```
adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44a37f4e701d0d74b2f3087cbbb4d2cce354e5bfcb1651dbb4d48e82fd2234d7
3
+ size 80792096
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1086/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-1086/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-1086/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eace20f3ff41af75c2ea9c9643e6063d8734d60b4b3bcec95f05c8940d3430be
3
+ size 80792096
checkpoint-1086/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1086/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-1086/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-1086/trainer_state.json ADDED
@@ -0,0 +1,1136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1086,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2237394809722901,
14
+ "epoch": 0.01841620626151013,
15
+ "grad_norm": 5.082435607910156,
16
+ "learning_rate": 3.308823529411765e-06,
17
+ "loss": 0.9237876892089844,
18
+ "mean_token_accuracy": 0.7685343027114868,
19
+ "num_tokens": 205423.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2295925617218018,
24
+ "epoch": 0.03683241252302026,
25
+ "grad_norm": 4.672000408172607,
26
+ "learning_rate": 6.985294117647059e-06,
27
+ "loss": 0.8900892257690429,
28
+ "mean_token_accuracy": 0.7677771031856537,
29
+ "num_tokens": 410849.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2285718679428101,
34
+ "epoch": 0.055248618784530384,
35
+ "grad_norm": 1.4828118085861206,
36
+ "learning_rate": 1.0661764705882354e-05,
37
+ "loss": 0.5975452899932862,
38
+ "mean_token_accuracy": 0.8146551787853241,
39
+ "num_tokens": 616438.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.210776400566101,
44
+ "epoch": 0.07366482504604052,
45
+ "grad_norm": 0.7761328816413879,
46
+ "learning_rate": 1.4338235294117647e-05,
47
+ "loss": 0.40664992332458494,
48
+ "mean_token_accuracy": 0.8699092030525207,
49
+ "num_tokens": 822118.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.200321125984192,
54
+ "epoch": 0.09208103130755065,
55
+ "grad_norm": 0.5363371968269348,
56
+ "learning_rate": 1.8014705882352943e-05,
57
+ "loss": 0.3313469409942627,
58
+ "mean_token_accuracy": 0.8904915869235992,
59
+ "num_tokens": 1027941.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.1809936046600342,
64
+ "epoch": 0.11049723756906077,
65
+ "grad_norm": 0.39541518688201904,
66
+ "learning_rate": 2.1691176470588237e-05,
67
+ "loss": 0.27568228244781495,
68
+ "mean_token_accuracy": 0.9047131836414337,
69
+ "num_tokens": 1233620.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.169810914993286,
74
+ "epoch": 0.1289134438305709,
75
+ "grad_norm": 0.341960072517395,
76
+ "learning_rate": 2.536764705882353e-05,
77
+ "loss": 0.245219087600708,
78
+ "mean_token_accuracy": 0.9150686681270599,
79
+ "num_tokens": 1438656.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.1652960777282715,
84
+ "epoch": 0.14732965009208104,
85
+ "grad_norm": 0.36872178316116333,
86
+ "learning_rate": 2.9044117647058828e-05,
87
+ "loss": 0.2220149040222168,
88
+ "mean_token_accuracy": 0.9224777698516846,
89
+ "num_tokens": 1643877.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.154341197013855,
94
+ "epoch": 0.16574585635359115,
95
+ "grad_norm": 0.4152425229549408,
96
+ "learning_rate": 3.272058823529412e-05,
97
+ "loss": 0.2002798557281494,
98
+ "mean_token_accuracy": 0.9285802960395813,
99
+ "num_tokens": 1849506.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.1507258892059327,
104
+ "epoch": 0.1841620626151013,
105
+ "grad_norm": 0.47647765278816223,
106
+ "learning_rate": 3.639705882352941e-05,
107
+ "loss": 0.18871363401412963,
108
+ "mean_token_accuracy": 0.9318056285381318,
109
+ "num_tokens": 2055071.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.1455535531044005,
114
+ "epoch": 0.20257826887661143,
115
+ "grad_norm": 0.4853009581565857,
116
+ "learning_rate": 4.007352941176471e-05,
117
+ "loss": 0.17836341857910157,
118
+ "mean_token_accuracy": 0.9367631554603577,
119
+ "num_tokens": 2260643.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.1402526497840881,
124
+ "epoch": 0.22099447513812154,
125
+ "grad_norm": 0.4455392360687256,
126
+ "learning_rate": 4.375e-05,
127
+ "loss": 0.16921783685684205,
128
+ "mean_token_accuracy": 0.9386959195137023,
129
+ "num_tokens": 2466085.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.1374777555465698,
134
+ "epoch": 0.23941068139963168,
135
+ "grad_norm": 0.5880279541015625,
136
+ "learning_rate": 4.742647058823529e-05,
137
+ "loss": 0.15989291667938232,
138
+ "mean_token_accuracy": 0.9421182632446289,
139
+ "num_tokens": 2671024.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1273940205574036,
144
+ "epoch": 0.2578268876611418,
145
+ "grad_norm": 0.612959086894989,
146
+ "learning_rate": 5.110294117647059e-05,
147
+ "loss": 0.14701461791992188,
148
+ "mean_token_accuracy": 0.9463540315628052,
149
+ "num_tokens": 2876848.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1263513088226318,
154
+ "epoch": 0.27624309392265195,
155
+ "grad_norm": 0.5695255398750305,
156
+ "learning_rate": 5.477941176470589e-05,
157
+ "loss": 0.14604382514953612,
158
+ "mean_token_accuracy": 0.946351945400238,
159
+ "num_tokens": 3082589.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1290789365768432,
164
+ "epoch": 0.2946593001841621,
165
+ "grad_norm": 0.6608090996742249,
166
+ "learning_rate": 5.845588235294118e-05,
167
+ "loss": 0.1409450054168701,
168
+ "mean_token_accuracy": 0.9481450319290161,
169
+ "num_tokens": 3287459.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1291529774665832,
174
+ "epoch": 0.31307550644567217,
175
+ "grad_norm": 0.652715802192688,
176
+ "learning_rate": 6.213235294117647e-05,
177
+ "loss": 0.14441155195236205,
178
+ "mean_token_accuracy": 0.9466125547885895,
179
+ "num_tokens": 3493682.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1244838953018188,
184
+ "epoch": 0.3314917127071823,
185
+ "grad_norm": 0.7815241813659668,
186
+ "learning_rate": 6.580882352941177e-05,
187
+ "loss": 0.13361064195632935,
188
+ "mean_token_accuracy": 0.9512295544147491,
189
+ "num_tokens": 3699573.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1217721104621887,
194
+ "epoch": 0.34990791896869244,
195
+ "grad_norm": 0.7933160066604614,
196
+ "learning_rate": 6.948529411764706e-05,
197
+ "loss": 0.13089522123336791,
198
+ "mean_token_accuracy": 0.9520221531391144,
199
+ "num_tokens": 3905156.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1206679105758668,
204
+ "epoch": 0.3683241252302026,
205
+ "grad_norm": 0.6815240383148193,
206
+ "learning_rate": 7.316176470588236e-05,
207
+ "loss": 0.13400404453277587,
208
+ "mean_token_accuracy": 0.9501322209835052,
209
+ "num_tokens": 4110570.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.1161052227020263,
214
+ "epoch": 0.3867403314917127,
215
+ "grad_norm": 0.8297767639160156,
216
+ "learning_rate": 7.683823529411766e-05,
217
+ "loss": 0.13389937877655028,
218
+ "mean_token_accuracy": 0.9501932203769684,
219
+ "num_tokens": 4315834.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1098745942115784,
224
+ "epoch": 0.40515653775322286,
225
+ "grad_norm": 0.5943381786346436,
226
+ "learning_rate": 8.051470588235294e-05,
227
+ "loss": 0.13452907800674438,
228
+ "mean_token_accuracy": 0.9503286242485046,
229
+ "num_tokens": 4520807.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.100480353832245,
234
+ "epoch": 0.42357274401473294,
235
+ "grad_norm": 0.6094359755516052,
236
+ "learning_rate": 8.419117647058824e-05,
237
+ "loss": 0.12827746868133544,
238
+ "mean_token_accuracy": 0.952492094039917,
239
+ "num_tokens": 4725867.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.0901286959648133,
244
+ "epoch": 0.4419889502762431,
245
+ "grad_norm": 0.7240597605705261,
246
+ "learning_rate": 8.786764705882353e-05,
247
+ "loss": 0.12171242237091065,
248
+ "mean_token_accuracy": 0.953943532705307,
249
+ "num_tokens": 4931629.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.0885071873664856,
254
+ "epoch": 0.4604051565377532,
255
+ "grad_norm": 0.6939547657966614,
256
+ "learning_rate": 9.154411764705882e-05,
257
+ "loss": 0.12155698537826538,
258
+ "mean_token_accuracy": 0.9545870959758759,
259
+ "num_tokens": 5137285.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.086272156238556,
264
+ "epoch": 0.47882136279926335,
265
+ "grad_norm": 0.5752800703048706,
266
+ "learning_rate": 9.522058823529412e-05,
267
+ "loss": 0.12157790660858155,
268
+ "mean_token_accuracy": 0.9541126549243927,
269
+ "num_tokens": 5342575.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.0857678413391114,
274
+ "epoch": 0.4972375690607735,
275
+ "grad_norm": 0.7565123438835144,
276
+ "learning_rate": 9.889705882352942e-05,
277
+ "loss": 0.12349612712860107,
278
+ "mean_token_accuracy": 0.9535140514373779,
279
+ "num_tokens": 5547995.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.079762625694275,
284
+ "epoch": 0.5156537753222836,
285
+ "grad_norm": 0.6972768306732178,
286
+ "learning_rate": 9.999954556423843e-05,
287
+ "loss": 0.11875582933425903,
288
+ "mean_token_accuracy": 0.9556483089923858,
289
+ "num_tokens": 5753195.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.0742079138755798,
294
+ "epoch": 0.5340699815837937,
295
+ "grad_norm": 0.7821696996688843,
296
+ "learning_rate": 9.999731977631227e-05,
297
+ "loss": 0.11824090480804443,
298
+ "mean_token_accuracy": 0.9557521045207977,
299
+ "num_tokens": 5958236.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.0679773569107056,
304
+ "epoch": 0.5524861878453039,
305
+ "grad_norm": 0.5846888422966003,
306
+ "learning_rate": 9.999323925089486e-05,
307
+ "loss": 0.11707355976104736,
308
+ "mean_token_accuracy": 0.9554719448089599,
309
+ "num_tokens": 6163992.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.0655727863311768,
314
+ "epoch": 0.570902394106814,
315
+ "grad_norm": 0.5812502503395081,
316
+ "learning_rate": 9.998730413936037e-05,
317
+ "loss": 0.11371417045593261,
318
+ "mean_token_accuracy": 0.9576376020908356,
319
+ "num_tokens": 6369456.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.0607039332389832,
324
+ "epoch": 0.5893186003683242,
325
+ "grad_norm": 0.6238475441932678,
326
+ "learning_rate": 9.99795146618821e-05,
327
+ "loss": 0.11775733232498169,
328
+ "mean_token_accuracy": 0.9557221591472626,
329
+ "num_tokens": 6574833.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.0504255175590516,
334
+ "epoch": 0.6077348066298343,
335
+ "grad_norm": 0.6496815085411072,
336
+ "learning_rate": 9.996987110742422e-05,
337
+ "loss": 0.10904088020324706,
338
+ "mean_token_accuracy": 0.9585366368293762,
339
+ "num_tokens": 6780108.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.0456081986427308,
344
+ "epoch": 0.6261510128913443,
345
+ "grad_norm": 0.786702573299408,
346
+ "learning_rate": 9.995837383373119e-05,
347
+ "loss": 0.10642309188842773,
348
+ "mean_token_accuracy": 0.9596696078777314,
349
+ "num_tokens": 6985920.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.0455098271369934,
354
+ "epoch": 0.6445672191528545,
355
+ "grad_norm": 0.5473790168762207,
356
+ "learning_rate": 9.994502326731434e-05,
357
+ "loss": 0.10822961330413819,
358
+ "mean_token_accuracy": 0.959563136100769,
359
+ "num_tokens": 7191465.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.04240562915802,
364
+ "epoch": 0.6629834254143646,
365
+ "grad_norm": 0.6672356128692627,
366
+ "learning_rate": 9.992981990343614e-05,
367
+ "loss": 0.1110004186630249,
368
+ "mean_token_accuracy": 0.9582514643669129,
369
+ "num_tokens": 7396877.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0386811256408692,
374
+ "epoch": 0.6813996316758748,
375
+ "grad_norm": 0.698539674282074,
376
+ "learning_rate": 9.99127643060918e-05,
377
+ "loss": 0.107539963722229,
378
+ "mean_token_accuracy": 0.9593036234378814,
379
+ "num_tokens": 7602437.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.0311225533485413,
384
+ "epoch": 0.6998158379373849,
385
+ "grad_norm": 0.6629284024238586,
386
+ "learning_rate": 9.989385710798837e-05,
387
+ "loss": 0.1064023494720459,
388
+ "mean_token_accuracy": 0.9602205216884613,
389
+ "num_tokens": 7808142.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.030210506916046,
394
+ "epoch": 0.7182320441988951,
395
+ "grad_norm": 0.5616748929023743,
396
+ "learning_rate": 9.987309901052121e-05,
397
+ "loss": 0.10717041492462158,
398
+ "mean_token_accuracy": 0.9599347949028015,
399
+ "num_tokens": 8013407.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0208017826080322,
404
+ "epoch": 0.7366482504604052,
405
+ "grad_norm": 0.6329049468040466,
406
+ "learning_rate": 9.985049078374806e-05,
407
+ "loss": 0.10359601974487305,
408
+ "mean_token_accuracy": 0.9603756129741668,
409
+ "num_tokens": 8219040.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.015640377998352,
414
+ "epoch": 0.7550644567219152,
415
+ "grad_norm": 0.6516013741493225,
416
+ "learning_rate": 9.982603326636037e-05,
417
+ "loss": 0.10146439075469971,
418
+ "mean_token_accuracy": 0.9627702474594116,
419
+ "num_tokens": 8424678.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0105359435081482,
424
+ "epoch": 0.7734806629834254,
425
+ "grad_norm": 0.6920603513717651,
426
+ "learning_rate": 9.979972736565226e-05,
427
+ "loss": 0.10770498514175415,
428
+ "mean_token_accuracy": 0.9591470420360565,
429
+ "num_tokens": 8629868.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.9966452836990356,
434
+ "epoch": 0.7918968692449355,
435
+ "grad_norm": 0.6857476234436035,
436
+ "learning_rate": 9.977157405748687e-05,
437
+ "loss": 0.10282524824142455,
438
+ "mean_token_accuracy": 0.9612209022045135,
439
+ "num_tokens": 8835320.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.9945534646511078,
444
+ "epoch": 0.8103130755064457,
445
+ "grad_norm": 0.7208472490310669,
446
+ "learning_rate": 9.974157438626008e-05,
447
+ "loss": 0.10069938898086547,
448
+ "mean_token_accuracy": 0.9620070576667785,
449
+ "num_tokens": 9041123.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.979461395740509,
454
+ "epoch": 0.8287292817679558,
455
+ "grad_norm": 0.5071915984153748,
456
+ "learning_rate": 9.970972946486185e-05,
457
+ "loss": 0.09799174070358277,
458
+ "mean_token_accuracy": 0.9620374023914338,
459
+ "num_tokens": 9246361.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.9830998003482818,
464
+ "epoch": 0.8471454880294659,
465
+ "grad_norm": 0.8660802245140076,
466
+ "learning_rate": 9.967604047463493e-05,
467
+ "loss": 0.10378165245056152,
468
+ "mean_token_accuracy": 0.9606865763664245,
469
+ "num_tokens": 9451845.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.9813413023948669,
474
+ "epoch": 0.8655616942909761,
475
+ "grad_norm": 0.7642477750778198,
476
+ "learning_rate": 9.964050866533094e-05,
477
+ "loss": 0.1010061264038086,
478
+ "mean_token_accuracy": 0.9608745336532593,
479
+ "num_tokens": 9656802.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.967874163389206,
484
+ "epoch": 0.8839779005524862,
485
+ "grad_norm": 0.5987281799316406,
486
+ "learning_rate": 9.960313535506411e-05,
487
+ "loss": 0.10169394016265869,
488
+ "mean_token_accuracy": 0.9611998200416565,
489
+ "num_tokens": 9861719.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.9663491308689117,
494
+ "epoch": 0.9023941068139963,
495
+ "grad_norm": 0.6124638319015503,
496
+ "learning_rate": 9.956392193026239e-05,
497
+ "loss": 0.102389657497406,
498
+ "mean_token_accuracy": 0.9611884355545044,
499
+ "num_tokens": 10066673.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.959654438495636,
504
+ "epoch": 0.9208103130755064,
505
+ "grad_norm": 0.7873051762580872,
506
+ "learning_rate": 9.952286984561592e-05,
507
+ "loss": 0.10170392990112305,
508
+ "mean_token_accuracy": 0.9610928475856781,
509
+ "num_tokens": 10272091.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.9550537407398224,
514
+ "epoch": 0.9392265193370166,
515
+ "grad_norm": 0.6071968078613281,
516
+ "learning_rate": 9.947998062402313e-05,
517
+ "loss": 0.09448277950286865,
518
+ "mean_token_accuracy": 0.9648977637290954,
519
+ "num_tokens": 10477632.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.9538533687591553,
524
+ "epoch": 0.9576427255985267,
525
+ "grad_norm": 0.6317242980003357,
526
+ "learning_rate": 9.943525585653428e-05,
527
+ "loss": 0.09542192220687866,
528
+ "mean_token_accuracy": 0.9635261118412017,
529
+ "num_tokens": 10682828.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.9362513542175293,
534
+ "epoch": 0.9760589318600368,
535
+ "grad_norm": 0.6421944499015808,
536
+ "learning_rate": 9.938869720229234e-05,
537
+ "loss": 0.09382058382034301,
538
+ "mean_token_accuracy": 0.9648073971271515,
539
+ "num_tokens": 10888741.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.9235438346862793,
544
+ "epoch": 0.994475138121547,
545
+ "grad_norm": 0.7986873388290405,
546
+ "learning_rate": 9.934030638847155e-05,
547
+ "loss": 0.09827429056167603,
548
+ "mean_token_accuracy": 0.9621128737926483,
549
+ "num_tokens": 11094387.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "epoch": 1.0,
554
+ "eval_entropy": 0.9137652366057686,
555
+ "eval_loss": 0.09368764609098434,
556
+ "eval_mean_token_accuracy": 0.9640816880309063,
557
+ "eval_num_tokens": 11155908.0,
558
+ "eval_runtime": 10.4701,
559
+ "eval_samples_per_second": 349.377,
560
+ "eval_steps_per_second": 10.984,
561
+ "step": 543
562
+ },
563
+ {
564
+ "entropy": 0.9047818422317505,
565
+ "epoch": 1.0128913443830572,
566
+ "grad_norm": 0.6781501173973083,
567
+ "learning_rate": 9.929008521021325e-05,
568
+ "loss": 0.0863916516304016,
569
+ "mean_token_accuracy": 0.9673655688762665,
570
+ "num_tokens": 11299715.0,
571
+ "step": 550
572
+ },
573
+ {
574
+ "entropy": 0.8856981039047241,
575
+ "epoch": 1.0313075506445673,
576
+ "grad_norm": 0.7143136858940125,
577
+ "learning_rate": 9.923803553055937e-05,
578
+ "loss": 0.08632323145866394,
579
+ "mean_token_accuracy": 0.9677783191204071,
580
+ "num_tokens": 11505059.0,
581
+ "step": 560
582
+ },
583
+ {
584
+ "entropy": 0.8937099635601043,
585
+ "epoch": 1.0497237569060773,
586
+ "grad_norm": 0.7751694321632385,
587
+ "learning_rate": 9.918415928038325e-05,
588
+ "loss": 0.08178263902664185,
589
+ "mean_token_accuracy": 0.9694291114807129,
590
+ "num_tokens": 11710464.0,
591
+ "step": 570
592
+ },
593
+ {
594
+ "entropy": 0.8858704209327698,
595
+ "epoch": 1.0681399631675874,
596
+ "grad_norm": 0.7492292523384094,
597
+ "learning_rate": 9.912845845831805e-05,
598
+ "loss": 0.08074211478233337,
599
+ "mean_token_accuracy": 0.9692470014095307,
600
+ "num_tokens": 11915959.0,
601
+ "step": 580
602
+ },
603
+ {
604
+ "entropy": 0.8948039829730987,
605
+ "epoch": 1.0865561694290977,
606
+ "grad_norm": 0.8116479516029358,
607
+ "learning_rate": 9.907093513068259e-05,
608
+ "loss": 0.08712012171745301,
609
+ "mean_token_accuracy": 0.9669980227947235,
610
+ "num_tokens": 12121499.0,
611
+ "step": 590
612
+ },
613
+ {
614
+ "entropy": 0.8846789538860321,
615
+ "epoch": 1.1049723756906078,
616
+ "grad_norm": 0.7295626997947693,
617
+ "learning_rate": 9.901159143140471e-05,
618
+ "loss": 0.08444435596466064,
619
+ "mean_token_accuracy": 0.9674544095993042,
620
+ "num_tokens": 12327061.0,
621
+ "step": 600
622
+ },
623
+ {
624
+ "entropy": 0.8734103918075562,
625
+ "epoch": 1.1233885819521179,
626
+ "grad_norm": 0.9585768580436707,
627
+ "learning_rate": 9.89504295619421e-05,
628
+ "loss": 0.08022565841674804,
629
+ "mean_token_accuracy": 0.969569206237793,
630
+ "num_tokens": 12532305.0,
631
+ "step": 610
632
+ },
633
+ {
634
+ "entropy": 0.8640486001968384,
635
+ "epoch": 1.141804788213628,
636
+ "grad_norm": 0.7891159057617188,
637
+ "learning_rate": 9.88874517912006e-05,
638
+ "loss": 0.08415375947952271,
639
+ "mean_token_accuracy": 0.9678892493247986,
640
+ "num_tokens": 12737828.0,
641
+ "step": 620
642
+ },
643
+ {
644
+ "entropy": 0.8599755525588989,
645
+ "epoch": 1.160220994475138,
646
+ "grad_norm": 0.5801345109939575,
647
+ "learning_rate": 9.882266045545012e-05,
648
+ "loss": 0.08100489974021911,
649
+ "mean_token_accuracy": 0.9688023269176483,
650
+ "num_tokens": 12943343.0,
651
+ "step": 630
652
+ },
653
+ {
654
+ "entropy": 0.86524977684021,
655
+ "epoch": 1.1786372007366483,
656
+ "grad_norm": 0.7633041143417358,
657
+ "learning_rate": 9.87560579582379e-05,
658
+ "loss": 0.07859406471252442,
659
+ "mean_token_accuracy": 0.9702189445495606,
660
+ "num_tokens": 13148473.0,
661
+ "step": 640
662
+ },
663
+ {
664
+ "entropy": 0.8466695249080658,
665
+ "epoch": 1.1970534069981584,
666
+ "grad_norm": 0.8672215938568115,
667
+ "learning_rate": 9.868764677029934e-05,
668
+ "loss": 0.08082623481750488,
669
+ "mean_token_accuracy": 0.9689972400665283,
670
+ "num_tokens": 13353890.0,
671
+ "step": 650
672
+ },
673
+ {
674
+ "entropy": 0.8596941530704498,
675
+ "epoch": 1.2154696132596685,
676
+ "grad_norm": 0.7524124383926392,
677
+ "learning_rate": 9.861742942946639e-05,
678
+ "loss": 0.0789935290813446,
679
+ "mean_token_accuracy": 0.9693858206272126,
680
+ "num_tokens": 13559475.0,
681
+ "step": 660
682
+ },
683
+ {
684
+ "entropy": 0.8708749234676361,
685
+ "epoch": 1.2338858195211786,
686
+ "grad_norm": 0.5777031183242798,
687
+ "learning_rate": 9.854540854057337e-05,
688
+ "loss": 0.07773642539978028,
689
+ "mean_token_accuracy": 0.970385092496872,
690
+ "num_tokens": 13765076.0,
691
+ "step": 670
692
+ },
693
+ {
694
+ "entropy": 0.8651713371276856,
695
+ "epoch": 1.2523020257826887,
696
+ "grad_norm": 0.7924166321754456,
697
+ "learning_rate": 9.847158677536034e-05,
698
+ "loss": 0.0766686737537384,
699
+ "mean_token_accuracy": 0.9702267110347748,
700
+ "num_tokens": 13970642.0,
701
+ "step": 680
702
+ },
703
+ {
704
+ "entropy": 0.8763024985790253,
705
+ "epoch": 1.270718232044199,
706
+ "grad_norm": 0.741219162940979,
707
+ "learning_rate": 9.839596687237403e-05,
708
+ "loss": 0.07189929485321045,
709
+ "mean_token_accuracy": 0.9727097094058991,
710
+ "num_tokens": 14176556.0,
711
+ "step": 690
712
+ },
713
+ {
714
+ "entropy": 0.8556921362876893,
715
+ "epoch": 1.289134438305709,
716
+ "grad_norm": 0.6298198103904724,
717
+ "learning_rate": 9.831855163686618e-05,
718
+ "loss": 0.07608137726783752,
719
+ "mean_token_accuracy": 0.9716399371623993,
720
+ "num_tokens": 14381686.0,
721
+ "step": 700
722
+ },
723
+ {
724
+ "entropy": 0.869178420305252,
725
+ "epoch": 1.3075506445672191,
726
+ "grad_norm": 0.5850273370742798,
727
+ "learning_rate": 9.823934394068952e-05,
728
+ "loss": 0.07437651753425598,
729
+ "mean_token_accuracy": 0.9709566533565521,
730
+ "num_tokens": 14586814.0,
731
+ "step": 710
732
+ },
733
+ {
734
+ "entropy": 0.8708595156669616,
735
+ "epoch": 1.3259668508287292,
736
+ "grad_norm": 0.6580632328987122,
737
+ "learning_rate": 9.815834672219127e-05,
738
+ "loss": 0.07518917322158813,
739
+ "mean_token_accuracy": 0.9717426657676697,
740
+ "num_tokens": 14792321.0,
741
+ "step": 720
742
+ },
743
+ {
744
+ "entropy": 0.8826817810535431,
745
+ "epoch": 1.3443830570902393,
746
+ "grad_norm": 0.8788532018661499,
747
+ "learning_rate": 9.807556298610404e-05,
748
+ "loss": 0.07579240798950196,
749
+ "mean_token_accuracy": 0.9706341981887817,
750
+ "num_tokens": 14997810.0,
751
+ "step": 730
752
+ },
753
+ {
754
+ "entropy": 0.9012470185756684,
755
+ "epoch": 1.3627992633517496,
756
+ "grad_norm": 0.7022138237953186,
757
+ "learning_rate": 9.799099580343441e-05,
758
+ "loss": 0.0775588572025299,
759
+ "mean_token_accuracy": 0.9699241399765015,
760
+ "num_tokens": 15203795.0,
761
+ "step": 740
762
+ },
763
+ {
764
+ "entropy": 0.886955714225769,
765
+ "epoch": 1.3812154696132597,
766
+ "grad_norm": 0.7881133556365967,
767
+ "learning_rate": 9.790464831134903e-05,
768
+ "loss": 0.07125020027160645,
769
+ "mean_token_accuracy": 0.9723815560340882,
770
+ "num_tokens": 15408974.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9047374844551086,
775
+ "epoch": 1.3996316758747698,
776
+ "grad_norm": 0.9082005023956299,
777
+ "learning_rate": 9.781652371305824e-05,
778
+ "loss": 0.07004334926605224,
779
+ "mean_token_accuracy": 0.9725580036640167,
780
+ "num_tokens": 15614399.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9039053857326508,
785
+ "epoch": 1.4180478821362799,
786
+ "grad_norm": 0.8060817122459412,
787
+ "learning_rate": 9.77266252776972e-05,
788
+ "loss": 0.07103485465049744,
789
+ "mean_token_accuracy": 0.9721468150615692,
790
+ "num_tokens": 15819895.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.8998047232627868,
795
+ "epoch": 1.43646408839779,
796
+ "grad_norm": 1.0152642726898193,
797
+ "learning_rate": 9.763495634020467e-05,
798
+ "loss": 0.07411704063415528,
799
+ "mean_token_accuracy": 0.9711063146591187,
800
+ "num_tokens": 16025297.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.9120213568210602,
805
+ "epoch": 1.4548802946593002,
806
+ "grad_norm": 0.6288319826126099,
807
+ "learning_rate": 9.754152030119921e-05,
808
+ "loss": 0.07223712205886841,
809
+ "mean_token_accuracy": 0.9722476422786712,
810
+ "num_tokens": 16230656.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.9142370820045471,
815
+ "epoch": 1.4732965009208103,
816
+ "grad_norm": 0.7854700088500977,
817
+ "learning_rate": 9.744632062685311e-05,
818
+ "loss": 0.07186744809150696,
819
+ "mean_token_accuracy": 0.972247713804245,
820
+ "num_tokens": 16435943.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.8920814216136932,
825
+ "epoch": 1.4917127071823204,
826
+ "grad_norm": 0.6227074265480042,
827
+ "learning_rate": 9.734936084876383e-05,
828
+ "loss": 0.07016961574554444,
829
+ "mean_token_accuracy": 0.9725603640079499,
830
+ "num_tokens": 16641635.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.891328877210617,
835
+ "epoch": 1.5101289134438307,
836
+ "grad_norm": 0.7601346969604492,
837
+ "learning_rate": 9.725064456382283e-05,
838
+ "loss": 0.07137494087219239,
839
+ "mean_token_accuracy": 0.9722997546195984,
840
+ "num_tokens": 16847194.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.8921217978000641,
845
+ "epoch": 1.5285451197053406,
846
+ "grad_norm": 0.7813850045204163,
847
+ "learning_rate": 9.715017543408233e-05,
848
+ "loss": 0.06890199184417725,
849
+ "mean_token_accuracy": 0.9735044002532959,
850
+ "num_tokens": 17052807.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.9085914671421051,
855
+ "epoch": 1.5469613259668509,
856
+ "grad_norm": 0.6184289455413818,
857
+ "learning_rate": 9.704795718661939e-05,
858
+ "loss": 0.07043765187263488,
859
+ "mean_token_accuracy": 0.9725716531276702,
860
+ "num_tokens": 17258284.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9029861629009247,
865
+ "epoch": 1.565377532228361,
866
+ "grad_norm": 0.7082377076148987,
867
+ "learning_rate": 9.694399361339752e-05,
868
+ "loss": 0.07113839387893676,
869
+ "mean_token_accuracy": 0.9725669205188752,
870
+ "num_tokens": 17464326.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.8856533527374267,
875
+ "epoch": 1.583793738489871,
876
+ "grad_norm": 0.7409216165542603,
877
+ "learning_rate": 9.683828857112627e-05,
878
+ "loss": 0.07077333331108093,
879
+ "mean_token_accuracy": 0.9731084644794464,
880
+ "num_tokens": 17669537.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.8613030433654785,
885
+ "epoch": 1.6022099447513813,
886
+ "grad_norm": 0.6801561713218689,
887
+ "learning_rate": 9.673084598111789e-05,
888
+ "loss": 0.06885308027267456,
889
+ "mean_token_accuracy": 0.97266526222229,
890
+ "num_tokens": 17875289.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.8692965865135193,
895
+ "epoch": 1.6206261510128912,
896
+ "grad_norm": 1.1621277332305908,
897
+ "learning_rate": 9.662166982914203e-05,
898
+ "loss": 0.07017780542373657,
899
+ "mean_token_accuracy": 0.9733059942722321,
900
+ "num_tokens": 18080404.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.8671502113342285,
905
+ "epoch": 1.6390423572744015,
906
+ "grad_norm": 0.7518903613090515,
907
+ "learning_rate": 9.651076416527787e-05,
908
+ "loss": 0.06977018713951111,
909
+ "mean_token_accuracy": 0.9730017304420471,
910
+ "num_tokens": 18285699.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.8662045657634735,
915
+ "epoch": 1.6574585635359116,
916
+ "grad_norm": 0.6622698903083801,
917
+ "learning_rate": 9.639813310376378e-05,
918
+ "loss": 0.06620995998382569,
919
+ "mean_token_accuracy": 0.9737491130828857,
920
+ "num_tokens": 18491097.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.8548173069953918,
925
+ "epoch": 1.6758747697974217,
926
+ "grad_norm": 0.8941843509674072,
927
+ "learning_rate": 9.628378082284479e-05,
928
+ "loss": 0.06711119413375854,
929
+ "mean_token_accuracy": 0.9740589797496796,
930
+ "num_tokens": 18696827.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.8763562262058258,
935
+ "epoch": 1.694290976058932,
936
+ "grad_norm": 0.7571700215339661,
937
+ "learning_rate": 9.616771156461755e-05,
938
+ "loss": 0.07263468503952027,
939
+ "mean_token_accuracy": 0.9717419981956482,
940
+ "num_tokens": 18902513.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.8663733780384064,
945
+ "epoch": 1.7127071823204418,
946
+ "grad_norm": 0.7886489629745483,
947
+ "learning_rate": 9.604992963487298e-05,
948
+ "loss": 0.07074605226516724,
949
+ "mean_token_accuracy": 0.9724965393543243,
950
+ "num_tokens": 19107812.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.8673004627227783,
955
+ "epoch": 1.7311233885819521,
956
+ "grad_norm": 0.8180726170539856,
957
+ "learning_rate": 9.593043940293647e-05,
958
+ "loss": 0.06831735372543335,
959
+ "mean_token_accuracy": 0.9733696818351746,
960
+ "num_tokens": 19313330.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.8525971233844757,
965
+ "epoch": 1.7495395948434622,
966
+ "grad_norm": 0.6576228737831116,
967
+ "learning_rate": 9.580924530150595e-05,
968
+ "loss": 0.06567002534866333,
969
+ "mean_token_accuracy": 0.9745754361152649,
970
+ "num_tokens": 19518671.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.8605451703071594,
975
+ "epoch": 1.7679558011049723,
976
+ "grad_norm": 0.7171661257743835,
977
+ "learning_rate": 9.568635182648725e-05,
978
+ "loss": 0.06872050762176514,
979
+ "mean_token_accuracy": 0.9732091546058654,
980
+ "num_tokens": 19724135.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.8642210960388184,
985
+ "epoch": 1.7863720073664826,
986
+ "grad_norm": 0.7603147029876709,
987
+ "learning_rate": 9.556176353682746e-05,
988
+ "loss": 0.06766576766967773,
989
+ "mean_token_accuracy": 0.9728681743144989,
990
+ "num_tokens": 19928785.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.8543185651302337,
995
+ "epoch": 1.8047882136279927,
996
+ "grad_norm": 0.7280875444412231,
997
+ "learning_rate": 9.543548505434581e-05,
998
+ "loss": 0.06851862668991089,
999
+ "mean_token_accuracy": 0.9737437188625335,
1000
+ "num_tokens": 20134195.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.8744745373725891,
1005
+ "epoch": 1.8232044198895028,
1006
+ "grad_norm": 0.5897248983383179,
1007
+ "learning_rate": 9.530752106356209e-05,
1008
+ "loss": 0.06809053421020508,
1009
+ "mean_token_accuracy": 0.9733593761920929,
1010
+ "num_tokens": 20339517.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.8623859465122223,
1015
+ "epoch": 1.8416206261510129,
1016
+ "grad_norm": 0.7515265345573425,
1017
+ "learning_rate": 9.517787631152298e-05,
1018
+ "loss": 0.07257847785949707,
1019
+ "mean_token_accuracy": 0.9714054942131043,
1020
+ "num_tokens": 20545249.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.8669404804706573,
1025
+ "epoch": 1.860036832412523,
1026
+ "grad_norm": 0.7144560813903809,
1027
+ "learning_rate": 9.504655560762596e-05,
1028
+ "loss": 0.06832354068756104,
1029
+ "mean_token_accuracy": 0.9735779523849487,
1030
+ "num_tokens": 20750507.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.8493516445159912,
1035
+ "epoch": 1.8784530386740332,
1036
+ "grad_norm": 0.6559189558029175,
1037
+ "learning_rate": 9.491356382344081e-05,
1038
+ "loss": 0.0629766047000885,
1039
+ "mean_token_accuracy": 0.9754977762699127,
1040
+ "num_tokens": 20955956.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.8599376022815705,
1045
+ "epoch": 1.8968692449355433,
1046
+ "grad_norm": 0.6792973279953003,
1047
+ "learning_rate": 9.477890589252895e-05,
1048
+ "loss": 0.0666757881641388,
1049
+ "mean_token_accuracy": 0.974083811044693,
1050
+ "num_tokens": 21161163.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.8458438158035279,
1055
+ "epoch": 1.9152854511970534,
1056
+ "grad_norm": 0.6941778659820557,
1057
+ "learning_rate": 9.464258681026042e-05,
1058
+ "loss": 0.06307152509689332,
1059
+ "mean_token_accuracy": 0.9757042229175568,
1060
+ "num_tokens": 21366525.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.848515909910202,
1065
+ "epoch": 1.9337016574585635,
1066
+ "grad_norm": 0.7307806611061096,
1067
+ "learning_rate": 9.450461163362855e-05,
1068
+ "loss": 0.06307026147842407,
1069
+ "mean_token_accuracy": 0.9750974595546722,
1070
+ "num_tokens": 21572238.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.8563454031944275,
1075
+ "epoch": 1.9521178637200736,
1076
+ "grad_norm": 0.7222106456756592,
1077
+ "learning_rate": 9.436498548106236e-05,
1078
+ "loss": 0.0647726058959961,
1079
+ "mean_token_accuracy": 0.974629694223404,
1080
+ "num_tokens": 21777633.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.8656457483768463,
1085
+ "epoch": 1.9705340699815839,
1086
+ "grad_norm": 0.67178875207901,
1087
+ "learning_rate": 9.422371353223674e-05,
1088
+ "loss": 0.06573554277420043,
1089
+ "mean_token_accuracy": 0.9745908617973328,
1090
+ "num_tokens": 21983116.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.8630891263484954,
1095
+ "epoch": 1.988950276243094,
1096
+ "grad_norm": 0.6956593990325928,
1097
+ "learning_rate": 9.408080102788016e-05,
1098
+ "loss": 0.06630704402923585,
1099
+ "mean_token_accuracy": 0.9741333484649658,
1100
+ "num_tokens": 22188662.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "epoch": 2.0,
1105
+ "eval_entropy": 0.8560857042022373,
1106
+ "eval_loss": 0.06494329869747162,
1107
+ "eval_mean_token_accuracy": 0.9745692672936813,
1108
+ "eval_num_tokens": 22311800.0,
1109
+ "eval_runtime": 10.129,
1110
+ "eval_samples_per_second": 361.142,
1111
+ "eval_steps_per_second": 11.354,
1112
+ "step": 1086
1113
+ }
1114
+ ],
1115
+ "logging_steps": 10,
1116
+ "max_steps": 5430,
1117
+ "num_input_tokens_seen": 0,
1118
+ "num_train_epochs": 10,
1119
+ "save_steps": 500,
1120
+ "stateful_callbacks": {
1121
+ "TrainerControl": {
1122
+ "args": {
1123
+ "should_epoch_stop": false,
1124
+ "should_evaluate": false,
1125
+ "should_log": false,
1126
+ "should_save": true,
1127
+ "should_training_stop": false
1128
+ },
1129
+ "attributes": {}
1130
+ }
1131
+ },
1132
+ "total_flos": 1.0639691635941704e+18,
1133
+ "train_batch_size": 32,
1134
+ "trial_name": null,
1135
+ "trial_params": null
1136
+ }
checkpoint-1086/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75
3
+ size 5777
checkpoint-1629/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-1629/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-1629/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebf4459fd4ce731043eb056554bc1c81afea162e43afc91a4e21906da57bbdc0
3
+ size 80792096
checkpoint-1629/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-1629/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-1629/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-1629/trainer_state.json ADDED
@@ -0,0 +1,1687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1629,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2237394809722901,
14
+ "epoch": 0.01841620626151013,
15
+ "grad_norm": 5.082435607910156,
16
+ "learning_rate": 3.308823529411765e-06,
17
+ "loss": 0.9237876892089844,
18
+ "mean_token_accuracy": 0.7685343027114868,
19
+ "num_tokens": 205423.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2295925617218018,
24
+ "epoch": 0.03683241252302026,
25
+ "grad_norm": 4.672000408172607,
26
+ "learning_rate": 6.985294117647059e-06,
27
+ "loss": 0.8900892257690429,
28
+ "mean_token_accuracy": 0.7677771031856537,
29
+ "num_tokens": 410849.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2285718679428101,
34
+ "epoch": 0.055248618784530384,
35
+ "grad_norm": 1.4828118085861206,
36
+ "learning_rate": 1.0661764705882354e-05,
37
+ "loss": 0.5975452899932862,
38
+ "mean_token_accuracy": 0.8146551787853241,
39
+ "num_tokens": 616438.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.210776400566101,
44
+ "epoch": 0.07366482504604052,
45
+ "grad_norm": 0.7761328816413879,
46
+ "learning_rate": 1.4338235294117647e-05,
47
+ "loss": 0.40664992332458494,
48
+ "mean_token_accuracy": 0.8699092030525207,
49
+ "num_tokens": 822118.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.200321125984192,
54
+ "epoch": 0.09208103130755065,
55
+ "grad_norm": 0.5363371968269348,
56
+ "learning_rate": 1.8014705882352943e-05,
57
+ "loss": 0.3313469409942627,
58
+ "mean_token_accuracy": 0.8904915869235992,
59
+ "num_tokens": 1027941.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.1809936046600342,
64
+ "epoch": 0.11049723756906077,
65
+ "grad_norm": 0.39541518688201904,
66
+ "learning_rate": 2.1691176470588237e-05,
67
+ "loss": 0.27568228244781495,
68
+ "mean_token_accuracy": 0.9047131836414337,
69
+ "num_tokens": 1233620.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.169810914993286,
74
+ "epoch": 0.1289134438305709,
75
+ "grad_norm": 0.341960072517395,
76
+ "learning_rate": 2.536764705882353e-05,
77
+ "loss": 0.245219087600708,
78
+ "mean_token_accuracy": 0.9150686681270599,
79
+ "num_tokens": 1438656.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.1652960777282715,
84
+ "epoch": 0.14732965009208104,
85
+ "grad_norm": 0.36872178316116333,
86
+ "learning_rate": 2.9044117647058828e-05,
87
+ "loss": 0.2220149040222168,
88
+ "mean_token_accuracy": 0.9224777698516846,
89
+ "num_tokens": 1643877.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.154341197013855,
94
+ "epoch": 0.16574585635359115,
95
+ "grad_norm": 0.4152425229549408,
96
+ "learning_rate": 3.272058823529412e-05,
97
+ "loss": 0.2002798557281494,
98
+ "mean_token_accuracy": 0.9285802960395813,
99
+ "num_tokens": 1849506.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.1507258892059327,
104
+ "epoch": 0.1841620626151013,
105
+ "grad_norm": 0.47647765278816223,
106
+ "learning_rate": 3.639705882352941e-05,
107
+ "loss": 0.18871363401412963,
108
+ "mean_token_accuracy": 0.9318056285381318,
109
+ "num_tokens": 2055071.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.1455535531044005,
114
+ "epoch": 0.20257826887661143,
115
+ "grad_norm": 0.4853009581565857,
116
+ "learning_rate": 4.007352941176471e-05,
117
+ "loss": 0.17836341857910157,
118
+ "mean_token_accuracy": 0.9367631554603577,
119
+ "num_tokens": 2260643.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.1402526497840881,
124
+ "epoch": 0.22099447513812154,
125
+ "grad_norm": 0.4455392360687256,
126
+ "learning_rate": 4.375e-05,
127
+ "loss": 0.16921783685684205,
128
+ "mean_token_accuracy": 0.9386959195137023,
129
+ "num_tokens": 2466085.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.1374777555465698,
134
+ "epoch": 0.23941068139963168,
135
+ "grad_norm": 0.5880279541015625,
136
+ "learning_rate": 4.742647058823529e-05,
137
+ "loss": 0.15989291667938232,
138
+ "mean_token_accuracy": 0.9421182632446289,
139
+ "num_tokens": 2671024.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1273940205574036,
144
+ "epoch": 0.2578268876611418,
145
+ "grad_norm": 0.612959086894989,
146
+ "learning_rate": 5.110294117647059e-05,
147
+ "loss": 0.14701461791992188,
148
+ "mean_token_accuracy": 0.9463540315628052,
149
+ "num_tokens": 2876848.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1263513088226318,
154
+ "epoch": 0.27624309392265195,
155
+ "grad_norm": 0.5695255398750305,
156
+ "learning_rate": 5.477941176470589e-05,
157
+ "loss": 0.14604382514953612,
158
+ "mean_token_accuracy": 0.946351945400238,
159
+ "num_tokens": 3082589.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1290789365768432,
164
+ "epoch": 0.2946593001841621,
165
+ "grad_norm": 0.6608090996742249,
166
+ "learning_rate": 5.845588235294118e-05,
167
+ "loss": 0.1409450054168701,
168
+ "mean_token_accuracy": 0.9481450319290161,
169
+ "num_tokens": 3287459.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1291529774665832,
174
+ "epoch": 0.31307550644567217,
175
+ "grad_norm": 0.652715802192688,
176
+ "learning_rate": 6.213235294117647e-05,
177
+ "loss": 0.14441155195236205,
178
+ "mean_token_accuracy": 0.9466125547885895,
179
+ "num_tokens": 3493682.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1244838953018188,
184
+ "epoch": 0.3314917127071823,
185
+ "grad_norm": 0.7815241813659668,
186
+ "learning_rate": 6.580882352941177e-05,
187
+ "loss": 0.13361064195632935,
188
+ "mean_token_accuracy": 0.9512295544147491,
189
+ "num_tokens": 3699573.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1217721104621887,
194
+ "epoch": 0.34990791896869244,
195
+ "grad_norm": 0.7933160066604614,
196
+ "learning_rate": 6.948529411764706e-05,
197
+ "loss": 0.13089522123336791,
198
+ "mean_token_accuracy": 0.9520221531391144,
199
+ "num_tokens": 3905156.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1206679105758668,
204
+ "epoch": 0.3683241252302026,
205
+ "grad_norm": 0.6815240383148193,
206
+ "learning_rate": 7.316176470588236e-05,
207
+ "loss": 0.13400404453277587,
208
+ "mean_token_accuracy": 0.9501322209835052,
209
+ "num_tokens": 4110570.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.1161052227020263,
214
+ "epoch": 0.3867403314917127,
215
+ "grad_norm": 0.8297767639160156,
216
+ "learning_rate": 7.683823529411766e-05,
217
+ "loss": 0.13389937877655028,
218
+ "mean_token_accuracy": 0.9501932203769684,
219
+ "num_tokens": 4315834.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1098745942115784,
224
+ "epoch": 0.40515653775322286,
225
+ "grad_norm": 0.5943381786346436,
226
+ "learning_rate": 8.051470588235294e-05,
227
+ "loss": 0.13452907800674438,
228
+ "mean_token_accuracy": 0.9503286242485046,
229
+ "num_tokens": 4520807.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.100480353832245,
234
+ "epoch": 0.42357274401473294,
235
+ "grad_norm": 0.6094359755516052,
236
+ "learning_rate": 8.419117647058824e-05,
237
+ "loss": 0.12827746868133544,
238
+ "mean_token_accuracy": 0.952492094039917,
239
+ "num_tokens": 4725867.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.0901286959648133,
244
+ "epoch": 0.4419889502762431,
245
+ "grad_norm": 0.7240597605705261,
246
+ "learning_rate": 8.786764705882353e-05,
247
+ "loss": 0.12171242237091065,
248
+ "mean_token_accuracy": 0.953943532705307,
249
+ "num_tokens": 4931629.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.0885071873664856,
254
+ "epoch": 0.4604051565377532,
255
+ "grad_norm": 0.6939547657966614,
256
+ "learning_rate": 9.154411764705882e-05,
257
+ "loss": 0.12155698537826538,
258
+ "mean_token_accuracy": 0.9545870959758759,
259
+ "num_tokens": 5137285.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.086272156238556,
264
+ "epoch": 0.47882136279926335,
265
+ "grad_norm": 0.5752800703048706,
266
+ "learning_rate": 9.522058823529412e-05,
267
+ "loss": 0.12157790660858155,
268
+ "mean_token_accuracy": 0.9541126549243927,
269
+ "num_tokens": 5342575.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.0857678413391114,
274
+ "epoch": 0.4972375690607735,
275
+ "grad_norm": 0.7565123438835144,
276
+ "learning_rate": 9.889705882352942e-05,
277
+ "loss": 0.12349612712860107,
278
+ "mean_token_accuracy": 0.9535140514373779,
279
+ "num_tokens": 5547995.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.079762625694275,
284
+ "epoch": 0.5156537753222836,
285
+ "grad_norm": 0.6972768306732178,
286
+ "learning_rate": 9.999954556423843e-05,
287
+ "loss": 0.11875582933425903,
288
+ "mean_token_accuracy": 0.9556483089923858,
289
+ "num_tokens": 5753195.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.0742079138755798,
294
+ "epoch": 0.5340699815837937,
295
+ "grad_norm": 0.7821696996688843,
296
+ "learning_rate": 9.999731977631227e-05,
297
+ "loss": 0.11824090480804443,
298
+ "mean_token_accuracy": 0.9557521045207977,
299
+ "num_tokens": 5958236.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.0679773569107056,
304
+ "epoch": 0.5524861878453039,
305
+ "grad_norm": 0.5846888422966003,
306
+ "learning_rate": 9.999323925089486e-05,
307
+ "loss": 0.11707355976104736,
308
+ "mean_token_accuracy": 0.9554719448089599,
309
+ "num_tokens": 6163992.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.0655727863311768,
314
+ "epoch": 0.570902394106814,
315
+ "grad_norm": 0.5812502503395081,
316
+ "learning_rate": 9.998730413936037e-05,
317
+ "loss": 0.11371417045593261,
318
+ "mean_token_accuracy": 0.9576376020908356,
319
+ "num_tokens": 6369456.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.0607039332389832,
324
+ "epoch": 0.5893186003683242,
325
+ "grad_norm": 0.6238475441932678,
326
+ "learning_rate": 9.99795146618821e-05,
327
+ "loss": 0.11775733232498169,
328
+ "mean_token_accuracy": 0.9557221591472626,
329
+ "num_tokens": 6574833.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.0504255175590516,
334
+ "epoch": 0.6077348066298343,
335
+ "grad_norm": 0.6496815085411072,
336
+ "learning_rate": 9.996987110742422e-05,
337
+ "loss": 0.10904088020324706,
338
+ "mean_token_accuracy": 0.9585366368293762,
339
+ "num_tokens": 6780108.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.0456081986427308,
344
+ "epoch": 0.6261510128913443,
345
+ "grad_norm": 0.786702573299408,
346
+ "learning_rate": 9.995837383373119e-05,
347
+ "loss": 0.10642309188842773,
348
+ "mean_token_accuracy": 0.9596696078777314,
349
+ "num_tokens": 6985920.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.0455098271369934,
354
+ "epoch": 0.6445672191528545,
355
+ "grad_norm": 0.5473790168762207,
356
+ "learning_rate": 9.994502326731434e-05,
357
+ "loss": 0.10822961330413819,
358
+ "mean_token_accuracy": 0.959563136100769,
359
+ "num_tokens": 7191465.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.04240562915802,
364
+ "epoch": 0.6629834254143646,
365
+ "grad_norm": 0.6672356128692627,
366
+ "learning_rate": 9.992981990343614e-05,
367
+ "loss": 0.1110004186630249,
368
+ "mean_token_accuracy": 0.9582514643669129,
369
+ "num_tokens": 7396877.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0386811256408692,
374
+ "epoch": 0.6813996316758748,
375
+ "grad_norm": 0.698539674282074,
376
+ "learning_rate": 9.99127643060918e-05,
377
+ "loss": 0.107539963722229,
378
+ "mean_token_accuracy": 0.9593036234378814,
379
+ "num_tokens": 7602437.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.0311225533485413,
384
+ "epoch": 0.6998158379373849,
385
+ "grad_norm": 0.6629284024238586,
386
+ "learning_rate": 9.989385710798837e-05,
387
+ "loss": 0.1064023494720459,
388
+ "mean_token_accuracy": 0.9602205216884613,
389
+ "num_tokens": 7808142.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.030210506916046,
394
+ "epoch": 0.7182320441988951,
395
+ "grad_norm": 0.5616748929023743,
396
+ "learning_rate": 9.987309901052121e-05,
397
+ "loss": 0.10717041492462158,
398
+ "mean_token_accuracy": 0.9599347949028015,
399
+ "num_tokens": 8013407.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0208017826080322,
404
+ "epoch": 0.7366482504604052,
405
+ "grad_norm": 0.6329049468040466,
406
+ "learning_rate": 9.985049078374806e-05,
407
+ "loss": 0.10359601974487305,
408
+ "mean_token_accuracy": 0.9603756129741668,
409
+ "num_tokens": 8219040.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.015640377998352,
414
+ "epoch": 0.7550644567219152,
415
+ "grad_norm": 0.6516013741493225,
416
+ "learning_rate": 9.982603326636037e-05,
417
+ "loss": 0.10146439075469971,
418
+ "mean_token_accuracy": 0.9627702474594116,
419
+ "num_tokens": 8424678.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0105359435081482,
424
+ "epoch": 0.7734806629834254,
425
+ "grad_norm": 0.6920603513717651,
426
+ "learning_rate": 9.979972736565226e-05,
427
+ "loss": 0.10770498514175415,
428
+ "mean_token_accuracy": 0.9591470420360565,
429
+ "num_tokens": 8629868.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.9966452836990356,
434
+ "epoch": 0.7918968692449355,
435
+ "grad_norm": 0.6857476234436035,
436
+ "learning_rate": 9.977157405748687e-05,
437
+ "loss": 0.10282524824142455,
438
+ "mean_token_accuracy": 0.9612209022045135,
439
+ "num_tokens": 8835320.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.9945534646511078,
444
+ "epoch": 0.8103130755064457,
445
+ "grad_norm": 0.7208472490310669,
446
+ "learning_rate": 9.974157438626008e-05,
447
+ "loss": 0.10069938898086547,
448
+ "mean_token_accuracy": 0.9620070576667785,
449
+ "num_tokens": 9041123.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.979461395740509,
454
+ "epoch": 0.8287292817679558,
455
+ "grad_norm": 0.5071915984153748,
456
+ "learning_rate": 9.970972946486185e-05,
457
+ "loss": 0.09799174070358277,
458
+ "mean_token_accuracy": 0.9620374023914338,
459
+ "num_tokens": 9246361.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.9830998003482818,
464
+ "epoch": 0.8471454880294659,
465
+ "grad_norm": 0.8660802245140076,
466
+ "learning_rate": 9.967604047463493e-05,
467
+ "loss": 0.10378165245056152,
468
+ "mean_token_accuracy": 0.9606865763664245,
469
+ "num_tokens": 9451845.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.9813413023948669,
474
+ "epoch": 0.8655616942909761,
475
+ "grad_norm": 0.7642477750778198,
476
+ "learning_rate": 9.964050866533094e-05,
477
+ "loss": 0.1010061264038086,
478
+ "mean_token_accuracy": 0.9608745336532593,
479
+ "num_tokens": 9656802.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.967874163389206,
484
+ "epoch": 0.8839779005524862,
485
+ "grad_norm": 0.5987281799316406,
486
+ "learning_rate": 9.960313535506411e-05,
487
+ "loss": 0.10169394016265869,
488
+ "mean_token_accuracy": 0.9611998200416565,
489
+ "num_tokens": 9861719.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.9663491308689117,
494
+ "epoch": 0.9023941068139963,
495
+ "grad_norm": 0.6124638319015503,
496
+ "learning_rate": 9.956392193026239e-05,
497
+ "loss": 0.102389657497406,
498
+ "mean_token_accuracy": 0.9611884355545044,
499
+ "num_tokens": 10066673.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.959654438495636,
504
+ "epoch": 0.9208103130755064,
505
+ "grad_norm": 0.7873051762580872,
506
+ "learning_rate": 9.952286984561592e-05,
507
+ "loss": 0.10170392990112305,
508
+ "mean_token_accuracy": 0.9610928475856781,
509
+ "num_tokens": 10272091.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.9550537407398224,
514
+ "epoch": 0.9392265193370166,
515
+ "grad_norm": 0.6071968078613281,
516
+ "learning_rate": 9.947998062402313e-05,
517
+ "loss": 0.09448277950286865,
518
+ "mean_token_accuracy": 0.9648977637290954,
519
+ "num_tokens": 10477632.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.9538533687591553,
524
+ "epoch": 0.9576427255985267,
525
+ "grad_norm": 0.6317242980003357,
526
+ "learning_rate": 9.943525585653428e-05,
527
+ "loss": 0.09542192220687866,
528
+ "mean_token_accuracy": 0.9635261118412017,
529
+ "num_tokens": 10682828.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.9362513542175293,
534
+ "epoch": 0.9760589318600368,
535
+ "grad_norm": 0.6421944499015808,
536
+ "learning_rate": 9.938869720229234e-05,
537
+ "loss": 0.09382058382034301,
538
+ "mean_token_accuracy": 0.9648073971271515,
539
+ "num_tokens": 10888741.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.9235438346862793,
544
+ "epoch": 0.994475138121547,
545
+ "grad_norm": 0.7986873388290405,
546
+ "learning_rate": 9.934030638847155e-05,
547
+ "loss": 0.09827429056167603,
548
+ "mean_token_accuracy": 0.9621128737926483,
549
+ "num_tokens": 11094387.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "epoch": 1.0,
554
+ "eval_entropy": 0.9137652366057686,
555
+ "eval_loss": 0.09368764609098434,
556
+ "eval_mean_token_accuracy": 0.9640816880309063,
557
+ "eval_num_tokens": 11155908.0,
558
+ "eval_runtime": 10.4701,
559
+ "eval_samples_per_second": 349.377,
560
+ "eval_steps_per_second": 10.984,
561
+ "step": 543
562
+ },
563
+ {
564
+ "entropy": 0.9047818422317505,
565
+ "epoch": 1.0128913443830572,
566
+ "grad_norm": 0.6781501173973083,
567
+ "learning_rate": 9.929008521021325e-05,
568
+ "loss": 0.0863916516304016,
569
+ "mean_token_accuracy": 0.9673655688762665,
570
+ "num_tokens": 11299715.0,
571
+ "step": 550
572
+ },
573
+ {
574
+ "entropy": 0.8856981039047241,
575
+ "epoch": 1.0313075506445673,
576
+ "grad_norm": 0.7143136858940125,
577
+ "learning_rate": 9.923803553055937e-05,
578
+ "loss": 0.08632323145866394,
579
+ "mean_token_accuracy": 0.9677783191204071,
580
+ "num_tokens": 11505059.0,
581
+ "step": 560
582
+ },
583
+ {
584
+ "entropy": 0.8937099635601043,
585
+ "epoch": 1.0497237569060773,
586
+ "grad_norm": 0.7751694321632385,
587
+ "learning_rate": 9.918415928038325e-05,
588
+ "loss": 0.08178263902664185,
589
+ "mean_token_accuracy": 0.9694291114807129,
590
+ "num_tokens": 11710464.0,
591
+ "step": 570
592
+ },
593
+ {
594
+ "entropy": 0.8858704209327698,
595
+ "epoch": 1.0681399631675874,
596
+ "grad_norm": 0.7492292523384094,
597
+ "learning_rate": 9.912845845831805e-05,
598
+ "loss": 0.08074211478233337,
599
+ "mean_token_accuracy": 0.9692470014095307,
600
+ "num_tokens": 11915959.0,
601
+ "step": 580
602
+ },
603
+ {
604
+ "entropy": 0.8948039829730987,
605
+ "epoch": 1.0865561694290977,
606
+ "grad_norm": 0.8116479516029358,
607
+ "learning_rate": 9.907093513068259e-05,
608
+ "loss": 0.08712012171745301,
609
+ "mean_token_accuracy": 0.9669980227947235,
610
+ "num_tokens": 12121499.0,
611
+ "step": 590
612
+ },
613
+ {
614
+ "entropy": 0.8846789538860321,
615
+ "epoch": 1.1049723756906078,
616
+ "grad_norm": 0.7295626997947693,
617
+ "learning_rate": 9.901159143140471e-05,
618
+ "loss": 0.08444435596466064,
619
+ "mean_token_accuracy": 0.9674544095993042,
620
+ "num_tokens": 12327061.0,
621
+ "step": 600
622
+ },
623
+ {
624
+ "entropy": 0.8734103918075562,
625
+ "epoch": 1.1233885819521179,
626
+ "grad_norm": 0.9585768580436707,
627
+ "learning_rate": 9.89504295619421e-05,
628
+ "loss": 0.08022565841674804,
629
+ "mean_token_accuracy": 0.969569206237793,
630
+ "num_tokens": 12532305.0,
631
+ "step": 610
632
+ },
633
+ {
634
+ "entropy": 0.8640486001968384,
635
+ "epoch": 1.141804788213628,
636
+ "grad_norm": 0.7891159057617188,
637
+ "learning_rate": 9.88874517912006e-05,
638
+ "loss": 0.08415375947952271,
639
+ "mean_token_accuracy": 0.9678892493247986,
640
+ "num_tokens": 12737828.0,
641
+ "step": 620
642
+ },
643
+ {
644
+ "entropy": 0.8599755525588989,
645
+ "epoch": 1.160220994475138,
646
+ "grad_norm": 0.5801345109939575,
647
+ "learning_rate": 9.882266045545012e-05,
648
+ "loss": 0.08100489974021911,
649
+ "mean_token_accuracy": 0.9688023269176483,
650
+ "num_tokens": 12943343.0,
651
+ "step": 630
652
+ },
653
+ {
654
+ "entropy": 0.86524977684021,
655
+ "epoch": 1.1786372007366483,
656
+ "grad_norm": 0.7633041143417358,
657
+ "learning_rate": 9.87560579582379e-05,
658
+ "loss": 0.07859406471252442,
659
+ "mean_token_accuracy": 0.9702189445495606,
660
+ "num_tokens": 13148473.0,
661
+ "step": 640
662
+ },
663
+ {
664
+ "entropy": 0.8466695249080658,
665
+ "epoch": 1.1970534069981584,
666
+ "grad_norm": 0.8672215938568115,
667
+ "learning_rate": 9.868764677029934e-05,
668
+ "loss": 0.08082623481750488,
669
+ "mean_token_accuracy": 0.9689972400665283,
670
+ "num_tokens": 13353890.0,
671
+ "step": 650
672
+ },
673
+ {
674
+ "entropy": 0.8596941530704498,
675
+ "epoch": 1.2154696132596685,
676
+ "grad_norm": 0.7524124383926392,
677
+ "learning_rate": 9.861742942946639e-05,
678
+ "loss": 0.0789935290813446,
679
+ "mean_token_accuracy": 0.9693858206272126,
680
+ "num_tokens": 13559475.0,
681
+ "step": 660
682
+ },
683
+ {
684
+ "entropy": 0.8708749234676361,
685
+ "epoch": 1.2338858195211786,
686
+ "grad_norm": 0.5777031183242798,
687
+ "learning_rate": 9.854540854057337e-05,
688
+ "loss": 0.07773642539978028,
689
+ "mean_token_accuracy": 0.970385092496872,
690
+ "num_tokens": 13765076.0,
691
+ "step": 670
692
+ },
693
+ {
694
+ "entropy": 0.8651713371276856,
695
+ "epoch": 1.2523020257826887,
696
+ "grad_norm": 0.7924166321754456,
697
+ "learning_rate": 9.847158677536034e-05,
698
+ "loss": 0.0766686737537384,
699
+ "mean_token_accuracy": 0.9702267110347748,
700
+ "num_tokens": 13970642.0,
701
+ "step": 680
702
+ },
703
+ {
704
+ "entropy": 0.8763024985790253,
705
+ "epoch": 1.270718232044199,
706
+ "grad_norm": 0.741219162940979,
707
+ "learning_rate": 9.839596687237403e-05,
708
+ "loss": 0.07189929485321045,
709
+ "mean_token_accuracy": 0.9727097094058991,
710
+ "num_tokens": 14176556.0,
711
+ "step": 690
712
+ },
713
+ {
714
+ "entropy": 0.8556921362876893,
715
+ "epoch": 1.289134438305709,
716
+ "grad_norm": 0.6298198103904724,
717
+ "learning_rate": 9.831855163686618e-05,
718
+ "loss": 0.07608137726783752,
719
+ "mean_token_accuracy": 0.9716399371623993,
720
+ "num_tokens": 14381686.0,
721
+ "step": 700
722
+ },
723
+ {
724
+ "entropy": 0.869178420305252,
725
+ "epoch": 1.3075506445672191,
726
+ "grad_norm": 0.5850273370742798,
727
+ "learning_rate": 9.823934394068952e-05,
728
+ "loss": 0.07437651753425598,
729
+ "mean_token_accuracy": 0.9709566533565521,
730
+ "num_tokens": 14586814.0,
731
+ "step": 710
732
+ },
733
+ {
734
+ "entropy": 0.8708595156669616,
735
+ "epoch": 1.3259668508287292,
736
+ "grad_norm": 0.6580632328987122,
737
+ "learning_rate": 9.815834672219127e-05,
738
+ "loss": 0.07518917322158813,
739
+ "mean_token_accuracy": 0.9717426657676697,
740
+ "num_tokens": 14792321.0,
741
+ "step": 720
742
+ },
743
+ {
744
+ "entropy": 0.8826817810535431,
745
+ "epoch": 1.3443830570902393,
746
+ "grad_norm": 0.8788532018661499,
747
+ "learning_rate": 9.807556298610404e-05,
748
+ "loss": 0.07579240798950196,
749
+ "mean_token_accuracy": 0.9706341981887817,
750
+ "num_tokens": 14997810.0,
751
+ "step": 730
752
+ },
753
+ {
754
+ "entropy": 0.9012470185756684,
755
+ "epoch": 1.3627992633517496,
756
+ "grad_norm": 0.7022138237953186,
757
+ "learning_rate": 9.799099580343441e-05,
758
+ "loss": 0.0775588572025299,
759
+ "mean_token_accuracy": 0.9699241399765015,
760
+ "num_tokens": 15203795.0,
761
+ "step": 740
762
+ },
763
+ {
764
+ "entropy": 0.886955714225769,
765
+ "epoch": 1.3812154696132597,
766
+ "grad_norm": 0.7881133556365967,
767
+ "learning_rate": 9.790464831134903e-05,
768
+ "loss": 0.07125020027160645,
769
+ "mean_token_accuracy": 0.9723815560340882,
770
+ "num_tokens": 15408974.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9047374844551086,
775
+ "epoch": 1.3996316758747698,
776
+ "grad_norm": 0.9082005023956299,
777
+ "learning_rate": 9.781652371305824e-05,
778
+ "loss": 0.07004334926605224,
779
+ "mean_token_accuracy": 0.9725580036640167,
780
+ "num_tokens": 15614399.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9039053857326508,
785
+ "epoch": 1.4180478821362799,
786
+ "grad_norm": 0.8060817122459412,
787
+ "learning_rate": 9.77266252776972e-05,
788
+ "loss": 0.07103485465049744,
789
+ "mean_token_accuracy": 0.9721468150615692,
790
+ "num_tokens": 15819895.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.8998047232627868,
795
+ "epoch": 1.43646408839779,
796
+ "grad_norm": 1.0152642726898193,
797
+ "learning_rate": 9.763495634020467e-05,
798
+ "loss": 0.07411704063415528,
799
+ "mean_token_accuracy": 0.9711063146591187,
800
+ "num_tokens": 16025297.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.9120213568210602,
805
+ "epoch": 1.4548802946593002,
806
+ "grad_norm": 0.6288319826126099,
807
+ "learning_rate": 9.754152030119921e-05,
808
+ "loss": 0.07223712205886841,
809
+ "mean_token_accuracy": 0.9722476422786712,
810
+ "num_tokens": 16230656.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.9142370820045471,
815
+ "epoch": 1.4732965009208103,
816
+ "grad_norm": 0.7854700088500977,
817
+ "learning_rate": 9.744632062685311e-05,
818
+ "loss": 0.07186744809150696,
819
+ "mean_token_accuracy": 0.972247713804245,
820
+ "num_tokens": 16435943.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.8920814216136932,
825
+ "epoch": 1.4917127071823204,
826
+ "grad_norm": 0.6227074265480042,
827
+ "learning_rate": 9.734936084876383e-05,
828
+ "loss": 0.07016961574554444,
829
+ "mean_token_accuracy": 0.9725603640079499,
830
+ "num_tokens": 16641635.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.891328877210617,
835
+ "epoch": 1.5101289134438307,
836
+ "grad_norm": 0.7601346969604492,
837
+ "learning_rate": 9.725064456382283e-05,
838
+ "loss": 0.07137494087219239,
839
+ "mean_token_accuracy": 0.9722997546195984,
840
+ "num_tokens": 16847194.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.8921217978000641,
845
+ "epoch": 1.5285451197053406,
846
+ "grad_norm": 0.7813850045204163,
847
+ "learning_rate": 9.715017543408233e-05,
848
+ "loss": 0.06890199184417725,
849
+ "mean_token_accuracy": 0.9735044002532959,
850
+ "num_tokens": 17052807.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.9085914671421051,
855
+ "epoch": 1.5469613259668509,
856
+ "grad_norm": 0.6184289455413818,
857
+ "learning_rate": 9.704795718661939e-05,
858
+ "loss": 0.07043765187263488,
859
+ "mean_token_accuracy": 0.9725716531276702,
860
+ "num_tokens": 17258284.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9029861629009247,
865
+ "epoch": 1.565377532228361,
866
+ "grad_norm": 0.7082377076148987,
867
+ "learning_rate": 9.694399361339752e-05,
868
+ "loss": 0.07113839387893676,
869
+ "mean_token_accuracy": 0.9725669205188752,
870
+ "num_tokens": 17464326.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.8856533527374267,
875
+ "epoch": 1.583793738489871,
876
+ "grad_norm": 0.7409216165542603,
877
+ "learning_rate": 9.683828857112627e-05,
878
+ "loss": 0.07077333331108093,
879
+ "mean_token_accuracy": 0.9731084644794464,
880
+ "num_tokens": 17669537.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.8613030433654785,
885
+ "epoch": 1.6022099447513813,
886
+ "grad_norm": 0.6801561713218689,
887
+ "learning_rate": 9.673084598111789e-05,
888
+ "loss": 0.06885308027267456,
889
+ "mean_token_accuracy": 0.97266526222229,
890
+ "num_tokens": 17875289.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.8692965865135193,
895
+ "epoch": 1.6206261510128912,
896
+ "grad_norm": 1.1621277332305908,
897
+ "learning_rate": 9.662166982914203e-05,
898
+ "loss": 0.07017780542373657,
899
+ "mean_token_accuracy": 0.9733059942722321,
900
+ "num_tokens": 18080404.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.8671502113342285,
905
+ "epoch": 1.6390423572744015,
906
+ "grad_norm": 0.7518903613090515,
907
+ "learning_rate": 9.651076416527787e-05,
908
+ "loss": 0.06977018713951111,
909
+ "mean_token_accuracy": 0.9730017304420471,
910
+ "num_tokens": 18285699.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.8662045657634735,
915
+ "epoch": 1.6574585635359116,
916
+ "grad_norm": 0.6622698903083801,
917
+ "learning_rate": 9.639813310376378e-05,
918
+ "loss": 0.06620995998382569,
919
+ "mean_token_accuracy": 0.9737491130828857,
920
+ "num_tokens": 18491097.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.8548173069953918,
925
+ "epoch": 1.6758747697974217,
926
+ "grad_norm": 0.8941843509674072,
927
+ "learning_rate": 9.628378082284479e-05,
928
+ "loss": 0.06711119413375854,
929
+ "mean_token_accuracy": 0.9740589797496796,
930
+ "num_tokens": 18696827.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.8763562262058258,
935
+ "epoch": 1.694290976058932,
936
+ "grad_norm": 0.7571700215339661,
937
+ "learning_rate": 9.616771156461755e-05,
938
+ "loss": 0.07263468503952027,
939
+ "mean_token_accuracy": 0.9717419981956482,
940
+ "num_tokens": 18902513.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.8663733780384064,
945
+ "epoch": 1.7127071823204418,
946
+ "grad_norm": 0.7886489629745483,
947
+ "learning_rate": 9.604992963487298e-05,
948
+ "loss": 0.07074605226516724,
949
+ "mean_token_accuracy": 0.9724965393543243,
950
+ "num_tokens": 19107812.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.8673004627227783,
955
+ "epoch": 1.7311233885819521,
956
+ "grad_norm": 0.8180726170539856,
957
+ "learning_rate": 9.593043940293647e-05,
958
+ "loss": 0.06831735372543335,
959
+ "mean_token_accuracy": 0.9733696818351746,
960
+ "num_tokens": 19313330.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.8525971233844757,
965
+ "epoch": 1.7495395948434622,
966
+ "grad_norm": 0.6576228737831116,
967
+ "learning_rate": 9.580924530150595e-05,
968
+ "loss": 0.06567002534866333,
969
+ "mean_token_accuracy": 0.9745754361152649,
970
+ "num_tokens": 19518671.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.8605451703071594,
975
+ "epoch": 1.7679558011049723,
976
+ "grad_norm": 0.7171661257743835,
977
+ "learning_rate": 9.568635182648725e-05,
978
+ "loss": 0.06872050762176514,
979
+ "mean_token_accuracy": 0.9732091546058654,
980
+ "num_tokens": 19724135.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.8642210960388184,
985
+ "epoch": 1.7863720073664826,
986
+ "grad_norm": 0.7603147029876709,
987
+ "learning_rate": 9.556176353682746e-05,
988
+ "loss": 0.06766576766967773,
989
+ "mean_token_accuracy": 0.9728681743144989,
990
+ "num_tokens": 19928785.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.8543185651302337,
995
+ "epoch": 1.8047882136279927,
996
+ "grad_norm": 0.7280875444412231,
997
+ "learning_rate": 9.543548505434581e-05,
998
+ "loss": 0.06851862668991089,
999
+ "mean_token_accuracy": 0.9737437188625335,
1000
+ "num_tokens": 20134195.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.8744745373725891,
1005
+ "epoch": 1.8232044198895028,
1006
+ "grad_norm": 0.5897248983383179,
1007
+ "learning_rate": 9.530752106356209e-05,
1008
+ "loss": 0.06809053421020508,
1009
+ "mean_token_accuracy": 0.9733593761920929,
1010
+ "num_tokens": 20339517.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.8623859465122223,
1015
+ "epoch": 1.8416206261510129,
1016
+ "grad_norm": 0.7515265345573425,
1017
+ "learning_rate": 9.517787631152298e-05,
1018
+ "loss": 0.07257847785949707,
1019
+ "mean_token_accuracy": 0.9714054942131043,
1020
+ "num_tokens": 20545249.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.8669404804706573,
1025
+ "epoch": 1.860036832412523,
1026
+ "grad_norm": 0.7144560813903809,
1027
+ "learning_rate": 9.504655560762596e-05,
1028
+ "loss": 0.06832354068756104,
1029
+ "mean_token_accuracy": 0.9735779523849487,
1030
+ "num_tokens": 20750507.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.8493516445159912,
1035
+ "epoch": 1.8784530386740332,
1036
+ "grad_norm": 0.6559189558029175,
1037
+ "learning_rate": 9.491356382344081e-05,
1038
+ "loss": 0.0629766047000885,
1039
+ "mean_token_accuracy": 0.9754977762699127,
1040
+ "num_tokens": 20955956.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.8599376022815705,
1045
+ "epoch": 1.8968692449355433,
1046
+ "grad_norm": 0.6792973279953003,
1047
+ "learning_rate": 9.477890589252895e-05,
1048
+ "loss": 0.0666757881641388,
1049
+ "mean_token_accuracy": 0.974083811044693,
1050
+ "num_tokens": 21161163.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.8458438158035279,
1055
+ "epoch": 1.9152854511970534,
1056
+ "grad_norm": 0.6941778659820557,
1057
+ "learning_rate": 9.464258681026042e-05,
1058
+ "loss": 0.06307152509689332,
1059
+ "mean_token_accuracy": 0.9757042229175568,
1060
+ "num_tokens": 21366525.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.848515909910202,
1065
+ "epoch": 1.9337016574585635,
1066
+ "grad_norm": 0.7307806611061096,
1067
+ "learning_rate": 9.450461163362855e-05,
1068
+ "loss": 0.06307026147842407,
1069
+ "mean_token_accuracy": 0.9750974595546722,
1070
+ "num_tokens": 21572238.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.8563454031944275,
1075
+ "epoch": 1.9521178637200736,
1076
+ "grad_norm": 0.7222106456756592,
1077
+ "learning_rate": 9.436498548106236e-05,
1078
+ "loss": 0.0647726058959961,
1079
+ "mean_token_accuracy": 0.974629694223404,
1080
+ "num_tokens": 21777633.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.8656457483768463,
1085
+ "epoch": 1.9705340699815839,
1086
+ "grad_norm": 0.67178875207901,
1087
+ "learning_rate": 9.422371353223674e-05,
1088
+ "loss": 0.06573554277420043,
1089
+ "mean_token_accuracy": 0.9745908617973328,
1090
+ "num_tokens": 21983116.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.8630891263484954,
1095
+ "epoch": 1.988950276243094,
1096
+ "grad_norm": 0.6956593990325928,
1097
+ "learning_rate": 9.408080102788016e-05,
1098
+ "loss": 0.06630704402923585,
1099
+ "mean_token_accuracy": 0.9741333484649658,
1100
+ "num_tokens": 22188662.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "epoch": 2.0,
1105
+ "eval_entropy": 0.8560857042022373,
1106
+ "eval_loss": 0.06494329869747162,
1107
+ "eval_mean_token_accuracy": 0.9745692672936813,
1108
+ "eval_num_tokens": 22311800.0,
1109
+ "eval_runtime": 10.129,
1110
+ "eval_samples_per_second": 361.142,
1111
+ "eval_steps_per_second": 11.354,
1112
+ "step": 1086
1113
+ },
1114
+ {
1115
+ "entropy": 0.8616272270679474,
1116
+ "epoch": 2.007366482504604,
1117
+ "grad_norm": 0.7778105139732361,
1118
+ "learning_rate": 9.393625326958041e-05,
1119
+ "loss": 0.054407155513763426,
1120
+ "mean_token_accuracy": 0.9792074799537659,
1121
+ "num_tokens": 22394215.0,
1122
+ "step": 1090
1123
+ },
1124
+ {
1125
+ "entropy": 0.8496910452842712,
1126
+ "epoch": 2.0257826887661143,
1127
+ "grad_norm": 0.7422528266906738,
1128
+ "learning_rate": 9.379007561958792e-05,
1129
+ "loss": 0.051881587505340575,
1130
+ "mean_token_accuracy": 0.9799090325832367,
1131
+ "num_tokens": 22599599.0,
1132
+ "step": 1100
1133
+ },
1134
+ {
1135
+ "entropy": 0.8531602442264556,
1136
+ "epoch": 2.044198895027624,
1137
+ "grad_norm": 0.9075332880020142,
1138
+ "learning_rate": 9.36422735006167e-05,
1139
+ "loss": 0.05190724730491638,
1140
+ "mean_token_accuracy": 0.979931116104126,
1141
+ "num_tokens": 22805318.0,
1142
+ "step": 1110
1143
+ },
1144
+ {
1145
+ "entropy": 0.8657277703285218,
1146
+ "epoch": 2.0626151012891345,
1147
+ "grad_norm": 0.9466913938522339,
1148
+ "learning_rate": 9.349285239564325e-05,
1149
+ "loss": 0.053853434324264524,
1150
+ "mean_token_accuracy": 0.9796103596687317,
1151
+ "num_tokens": 23010438.0,
1152
+ "step": 1120
1153
+ },
1154
+ {
1155
+ "entropy": 0.8578485429286957,
1156
+ "epoch": 2.0810313075506444,
1157
+ "grad_norm": 0.6903054714202881,
1158
+ "learning_rate": 9.334181784770326e-05,
1159
+ "loss": 0.05228850841522217,
1160
+ "mean_token_accuracy": 0.9802409887313843,
1161
+ "num_tokens": 23215795.0,
1162
+ "step": 1130
1163
+ },
1164
+ {
1165
+ "entropy": 0.8450767934322357,
1166
+ "epoch": 2.0994475138121547,
1167
+ "grad_norm": 0.6615211367607117,
1168
+ "learning_rate": 9.318917545968581e-05,
1169
+ "loss": 0.050570905208587646,
1170
+ "mean_token_accuracy": 0.9802053451538086,
1171
+ "num_tokens": 23421157.0,
1172
+ "step": 1140
1173
+ },
1174
+ {
1175
+ "entropy": 0.8325044393539429,
1176
+ "epoch": 2.117863720073665,
1177
+ "grad_norm": 0.760960578918457,
1178
+ "learning_rate": 9.303493089412564e-05,
1179
+ "loss": 0.051966112852096555,
1180
+ "mean_token_accuracy": 0.9796205997467041,
1181
+ "num_tokens": 23626584.0,
1182
+ "step": 1150
1183
+ },
1184
+ {
1185
+ "entropy": 0.8416404843330383,
1186
+ "epoch": 2.136279926335175,
1187
+ "grad_norm": 0.6947009563446045,
1188
+ "learning_rate": 9.287908987299306e-05,
1189
+ "loss": 0.05144861936569214,
1190
+ "mean_token_accuracy": 0.9800034642219544,
1191
+ "num_tokens": 23832137.0,
1192
+ "step": 1160
1193
+ },
1194
+ {
1195
+ "entropy": 0.8564540028572083,
1196
+ "epoch": 2.154696132596685,
1197
+ "grad_norm": 0.733252763748169,
1198
+ "learning_rate": 9.272165817748164e-05,
1199
+ "loss": 0.04944799542427063,
1200
+ "mean_token_accuracy": 0.9808157980442047,
1201
+ "num_tokens": 24038006.0,
1202
+ "step": 1170
1203
+ },
1204
+ {
1205
+ "entropy": 0.8575525343418121,
1206
+ "epoch": 2.1731123388581954,
1207
+ "grad_norm": 0.8911028504371643,
1208
+ "learning_rate": 9.25626416477938e-05,
1209
+ "loss": 0.05037952661514282,
1210
+ "mean_token_accuracy": 0.980946284532547,
1211
+ "num_tokens": 24243374.0,
1212
+ "step": 1180
1213
+ },
1214
+ {
1215
+ "entropy": 0.8599720418453216,
1216
+ "epoch": 2.1915285451197053,
1217
+ "grad_norm": 0.7713524103164673,
1218
+ "learning_rate": 9.240204618292416e-05,
1219
+ "loss": 0.050603735446929934,
1220
+ "mean_token_accuracy": 0.980896121263504,
1221
+ "num_tokens": 24448585.0,
1222
+ "step": 1190
1223
+ },
1224
+ {
1225
+ "entropy": 0.8566664934158326,
1226
+ "epoch": 2.2099447513812156,
1227
+ "grad_norm": 0.8439353704452515,
1228
+ "learning_rate": 9.223987774044066e-05,
1229
+ "loss": 0.054171699285507205,
1230
+ "mean_token_accuracy": 0.9796543836593627,
1231
+ "num_tokens": 24653863.0,
1232
+ "step": 1200
1233
+ },
1234
+ {
1235
+ "entropy": 0.846601277589798,
1236
+ "epoch": 2.2283609576427255,
1237
+ "grad_norm": 0.7025637030601501,
1238
+ "learning_rate": 9.207614233626356e-05,
1239
+ "loss": 0.048924127221107484,
1240
+ "mean_token_accuracy": 0.9809681415557862,
1241
+ "num_tokens": 24859801.0,
1242
+ "step": 1210
1243
+ },
1244
+ {
1245
+ "entropy": 0.8564423739910125,
1246
+ "epoch": 2.2467771639042358,
1247
+ "grad_norm": 0.7788274884223938,
1248
+ "learning_rate": 9.191084604444233e-05,
1249
+ "loss": 0.05260283350944519,
1250
+ "mean_token_accuracy": 0.9793797850608825,
1251
+ "num_tokens": 25065368.0,
1252
+ "step": 1220
1253
+ },
1254
+ {
1255
+ "entropy": 0.865056723356247,
1256
+ "epoch": 2.265193370165746,
1257
+ "grad_norm": 0.8728818297386169,
1258
+ "learning_rate": 9.174399499693027e-05,
1259
+ "loss": 0.05016371011734009,
1260
+ "mean_token_accuracy": 0.9807134211063385,
1261
+ "num_tokens": 25270945.0,
1262
+ "step": 1230
1263
+ },
1264
+ {
1265
+ "entropy": 0.8642262935638427,
1266
+ "epoch": 2.283609576427256,
1267
+ "grad_norm": 1.0582489967346191,
1268
+ "learning_rate": 9.157559538335703e-05,
1269
+ "loss": 0.05316779017448425,
1270
+ "mean_token_accuracy": 0.9794209063053131,
1271
+ "num_tokens": 25476575.0,
1272
+ "step": 1240
1273
+ },
1274
+ {
1275
+ "entropy": 0.8677761554718018,
1276
+ "epoch": 2.3020257826887662,
1277
+ "grad_norm": 0.760109543800354,
1278
+ "learning_rate": 9.140565345079901e-05,
1279
+ "loss": 0.05115479230880737,
1280
+ "mean_token_accuracy": 0.9802310705184937,
1281
+ "num_tokens": 25682814.0,
1282
+ "step": 1250
1283
+ },
1284
+ {
1285
+ "entropy": 0.8592945456504821,
1286
+ "epoch": 2.320441988950276,
1287
+ "grad_norm": 0.6537907123565674,
1288
+ "learning_rate": 9.123417550354761e-05,
1289
+ "loss": 0.050543540716171266,
1290
+ "mean_token_accuracy": 0.9806945025920868,
1291
+ "num_tokens": 25887575.0,
1292
+ "step": 1260
1293
+ },
1294
+ {
1295
+ "entropy": 0.8692500293254852,
1296
+ "epoch": 2.3388581952117864,
1297
+ "grad_norm": 0.7771905064582825,
1298
+ "learning_rate": 9.106116790287541e-05,
1299
+ "loss": 0.049718713760375975,
1300
+ "mean_token_accuracy": 0.9805168390274048,
1301
+ "num_tokens": 26092950.0,
1302
+ "step": 1270
1303
+ },
1304
+ {
1305
+ "entropy": 0.8841261565685272,
1306
+ "epoch": 2.3572744014732967,
1307
+ "grad_norm": 0.7791076898574829,
1308
+ "learning_rate": 9.08866370668001e-05,
1309
+ "loss": 0.0527400553226471,
1310
+ "mean_token_accuracy": 0.9796754539012908,
1311
+ "num_tokens": 26298182.0,
1312
+ "step": 1280
1313
+ },
1314
+ {
1315
+ "entropy": 0.8675022900104523,
1316
+ "epoch": 2.3756906077348066,
1317
+ "grad_norm": 0.8481605648994446,
1318
+ "learning_rate": 9.07105894698464e-05,
1319
+ "loss": 0.05320838689804077,
1320
+ "mean_token_accuracy": 0.9792274832725525,
1321
+ "num_tokens": 26503425.0,
1322
+ "step": 1290
1323
+ },
1324
+ {
1325
+ "entropy": 0.8704026222229004,
1326
+ "epoch": 2.394106813996317,
1327
+ "grad_norm": 0.8235505819320679,
1328
+ "learning_rate": 9.053303164280602e-05,
1329
+ "loss": 0.055045205354690555,
1330
+ "mean_token_accuracy": 0.9788750648498535,
1331
+ "num_tokens": 26708755.0,
1332
+ "step": 1300
1333
+ },
1334
+ {
1335
+ "entropy": 0.8525134027004242,
1336
+ "epoch": 2.4125230202578267,
1337
+ "grad_norm": 0.7611598968505859,
1338
+ "learning_rate": 9.035397017249518e-05,
1339
+ "loss": 0.05029621124267578,
1340
+ "mean_token_accuracy": 0.9802757322788238,
1341
+ "num_tokens": 26914704.0,
1342
+ "step": 1310
1343
+ },
1344
+ {
1345
+ "entropy": 0.8630305290222168,
1346
+ "epoch": 2.430939226519337,
1347
+ "grad_norm": 0.790408194065094,
1348
+ "learning_rate": 9.017341170151041e-05,
1349
+ "loss": 0.04856040775775909,
1350
+ "mean_token_accuracy": 0.9809690833091735,
1351
+ "num_tokens": 27120151.0,
1352
+ "step": 1320
1353
+ },
1354
+ {
1355
+ "entropy": 0.8579159140586853,
1356
+ "epoch": 2.4493554327808473,
1357
+ "grad_norm": 0.781972348690033,
1358
+ "learning_rate": 8.999136292798207e-05,
1359
+ "loss": 0.04869682788848877,
1360
+ "mean_token_accuracy": 0.9816130697727203,
1361
+ "num_tokens": 27325673.0,
1362
+ "step": 1330
1363
+ },
1364
+ {
1365
+ "entropy": 0.8634716987609863,
1366
+ "epoch": 2.467771639042357,
1367
+ "grad_norm": 0.8500784039497375,
1368
+ "learning_rate": 8.980783060532588e-05,
1369
+ "loss": 0.05050289034843445,
1370
+ "mean_token_accuracy": 0.980079609155655,
1371
+ "num_tokens": 27531270.0,
1372
+ "step": 1340
1373
+ },
1374
+ {
1375
+ "entropy": 0.8660618126392364,
1376
+ "epoch": 2.4861878453038675,
1377
+ "grad_norm": 0.719760537147522,
1378
+ "learning_rate": 8.96228215419924e-05,
1379
+ "loss": 0.04892141819000244,
1380
+ "mean_token_accuracy": 0.9814020991325378,
1381
+ "num_tokens": 27736542.0,
1382
+ "step": 1350
1383
+ },
1384
+ {
1385
+ "entropy": 0.8572284400463104,
1386
+ "epoch": 2.5046040515653774,
1387
+ "grad_norm": 1.0197229385375977,
1388
+ "learning_rate": 8.943634260121442e-05,
1389
+ "loss": 0.05104702711105347,
1390
+ "mean_token_accuracy": 0.9798846662044525,
1391
+ "num_tokens": 27941566.0,
1392
+ "step": 1360
1393
+ },
1394
+ {
1395
+ "entropy": 0.8702241241931915,
1396
+ "epoch": 2.5230202578268877,
1397
+ "grad_norm": 0.7136003375053406,
1398
+ "learning_rate": 8.924840070075247e-05,
1399
+ "loss": 0.04855787754058838,
1400
+ "mean_token_accuracy": 0.9811685383319855,
1401
+ "num_tokens": 28146943.0,
1402
+ "step": 1370
1403
+ },
1404
+ {
1405
+ "entropy": 0.874957013130188,
1406
+ "epoch": 2.541436464088398,
1407
+ "grad_norm": 0.8775497674942017,
1408
+ "learning_rate": 8.905900281263804e-05,
1409
+ "loss": 0.052434295415878296,
1410
+ "mean_token_accuracy": 0.9795438170433044,
1411
+ "num_tokens": 28352640.0,
1412
+ "step": 1380
1413
+ },
1414
+ {
1415
+ "entropy": 0.8776536166667939,
1416
+ "epoch": 2.559852670349908,
1417
+ "grad_norm": 0.8895741105079651,
1418
+ "learning_rate": 8.8868155962915e-05,
1419
+ "loss": 0.05282890796661377,
1420
+ "mean_token_accuracy": 0.9790538609027862,
1421
+ "num_tokens": 28558153.0,
1422
+ "step": 1390
1423
+ },
1424
+ {
1425
+ "entropy": 0.8738743245601654,
1426
+ "epoch": 2.578268876611418,
1427
+ "grad_norm": 0.788800060749054,
1428
+ "learning_rate": 8.867586723137906e-05,
1429
+ "loss": 0.048841872811317445,
1430
+ "mean_token_accuracy": 0.9809149026870727,
1431
+ "num_tokens": 28763613.0,
1432
+ "step": 1400
1433
+ },
1434
+ {
1435
+ "entropy": 0.8750253796577454,
1436
+ "epoch": 2.596685082872928,
1437
+ "grad_norm": 0.8738002777099609,
1438
+ "learning_rate": 8.848214375131497e-05,
1439
+ "loss": 0.048261132836341855,
1440
+ "mean_token_accuracy": 0.980789190530777,
1441
+ "num_tokens": 28969248.0,
1442
+ "step": 1410
1443
+ },
1444
+ {
1445
+ "entropy": 0.8624245524406433,
1446
+ "epoch": 2.6151012891344383,
1447
+ "grad_norm": 0.6404895186424255,
1448
+ "learning_rate": 8.828699270923196e-05,
1449
+ "loss": 0.04970468282699585,
1450
+ "mean_token_accuracy": 0.9807762265205383,
1451
+ "num_tokens": 29174779.0,
1452
+ "step": 1420
1453
+ },
1454
+ {
1455
+ "entropy": 0.8792938470840455,
1456
+ "epoch": 2.6335174953959486,
1457
+ "grad_norm": 0.7856965661048889,
1458
+ "learning_rate": 8.80904213445972e-05,
1459
+ "loss": 0.053334391117095946,
1460
+ "mean_token_accuracy": 0.9790222108364105,
1461
+ "num_tokens": 29380474.0,
1462
+ "step": 1430
1463
+ },
1464
+ {
1465
+ "entropy": 0.8831034600734711,
1466
+ "epoch": 2.6519337016574585,
1467
+ "grad_norm": 0.7739618420600891,
1468
+ "learning_rate": 8.789243694956716e-05,
1469
+ "loss": 0.04959054589271546,
1470
+ "mean_token_accuracy": 0.9803965091705322,
1471
+ "num_tokens": 29585985.0,
1472
+ "step": 1440
1473
+ },
1474
+ {
1475
+ "entropy": 0.8934672951698304,
1476
+ "epoch": 2.6703499079189688,
1477
+ "grad_norm": 0.6999697089195251,
1478
+ "learning_rate": 8.769304686871719e-05,
1479
+ "loss": 0.05165250301361084,
1480
+ "mean_token_accuracy": 0.9798884153366089,
1481
+ "num_tokens": 29791238.0,
1482
+ "step": 1450
1483
+ },
1484
+ {
1485
+ "entropy": 0.9053199410438537,
1486
+ "epoch": 2.6887661141804786,
1487
+ "grad_norm": 0.9199564456939697,
1488
+ "learning_rate": 8.749225849876892e-05,
1489
+ "loss": 0.04924143850803375,
1490
+ "mean_token_accuracy": 0.9810785710811615,
1491
+ "num_tokens": 29996589.0,
1492
+ "step": 1460
1493
+ },
1494
+ {
1495
+ "entropy": 0.888091403245926,
1496
+ "epoch": 2.707182320441989,
1497
+ "grad_norm": 0.7480106353759766,
1498
+ "learning_rate": 8.729007928831597e-05,
1499
+ "loss": 0.04948916733264923,
1500
+ "mean_token_accuracy": 0.9809579730033875,
1501
+ "num_tokens": 30201875.0,
1502
+ "step": 1470
1503
+ },
1504
+ {
1505
+ "entropy": 0.8723407983779907,
1506
+ "epoch": 2.7255985267034992,
1507
+ "grad_norm": 0.9506945013999939,
1508
+ "learning_rate": 8.708651673754763e-05,
1509
+ "loss": 0.048927539587020875,
1510
+ "mean_token_accuracy": 0.980553150177002,
1511
+ "num_tokens": 30407550.0,
1512
+ "step": 1480
1513
+ },
1514
+ {
1515
+ "entropy": 0.8737521529197693,
1516
+ "epoch": 2.744014732965009,
1517
+ "grad_norm": 0.8015706539154053,
1518
+ "learning_rate": 8.688157839797062e-05,
1519
+ "loss": 0.04963063597679138,
1520
+ "mean_token_accuracy": 0.9809738755226135,
1521
+ "num_tokens": 30612839.0,
1522
+ "step": 1490
1523
+ },
1524
+ {
1525
+ "entropy": 0.8800762951374054,
1526
+ "epoch": 2.7624309392265194,
1527
+ "grad_norm": 0.9429986476898193,
1528
+ "learning_rate": 8.667527187212885e-05,
1529
+ "loss": 0.0524174690246582,
1530
+ "mean_token_accuracy": 0.9788767337799072,
1531
+ "num_tokens": 30818578.0,
1532
+ "step": 1500
1533
+ },
1534
+ {
1535
+ "entropy": 0.8871055901050567,
1536
+ "epoch": 2.7808471454880292,
1537
+ "grad_norm": 0.5909196138381958,
1538
+ "learning_rate": 8.646760481332157e-05,
1539
+ "loss": 0.05166680812835693,
1540
+ "mean_token_accuracy": 0.980216771364212,
1541
+ "num_tokens": 31023829.0,
1542
+ "step": 1510
1543
+ },
1544
+ {
1545
+ "entropy": 0.8908755779266357,
1546
+ "epoch": 2.7992633517495396,
1547
+ "grad_norm": 0.9154611229896545,
1548
+ "learning_rate": 8.625858492531931e-05,
1549
+ "loss": 0.04951836466789246,
1550
+ "mean_token_accuracy": 0.9801484227180481,
1551
+ "num_tokens": 31229635.0,
1552
+ "step": 1520
1553
+ },
1554
+ {
1555
+ "entropy": 0.92480548620224,
1556
+ "epoch": 2.81767955801105,
1557
+ "grad_norm": 0.5989938378334045,
1558
+ "learning_rate": 8.604821996207819e-05,
1559
+ "loss": 0.04799881279468536,
1560
+ "mean_token_accuracy": 0.9817522585391998,
1561
+ "num_tokens": 31435456.0,
1562
+ "step": 1530
1563
+ },
1564
+ {
1565
+ "entropy": 0.9173881888389588,
1566
+ "epoch": 2.8360957642725597,
1567
+ "grad_norm": 0.899413526058197,
1568
+ "learning_rate": 8.58365177274522e-05,
1569
+ "loss": 0.0487445592880249,
1570
+ "mean_token_accuracy": 0.9812625288963318,
1571
+ "num_tokens": 31640904.0,
1572
+ "step": 1540
1573
+ },
1574
+ {
1575
+ "entropy": 0.9076135993003845,
1576
+ "epoch": 2.85451197053407,
1577
+ "grad_norm": 0.8494166135787964,
1578
+ "learning_rate": 8.562348607490376e-05,
1579
+ "loss": 0.05005228519439697,
1580
+ "mean_token_accuracy": 0.9806681036949157,
1581
+ "num_tokens": 31845807.0,
1582
+ "step": 1550
1583
+ },
1584
+ {
1585
+ "entropy": 0.9092245221138,
1586
+ "epoch": 2.87292817679558,
1587
+ "grad_norm": 0.8225123286247253,
1588
+ "learning_rate": 8.540913290721234e-05,
1589
+ "loss": 0.048654764890670776,
1590
+ "mean_token_accuracy": 0.9805659353733063,
1591
+ "num_tokens": 32051523.0,
1592
+ "step": 1560
1593
+ },
1594
+ {
1595
+ "entropy": 0.9062779664993286,
1596
+ "epoch": 2.89134438305709,
1597
+ "grad_norm": 0.7074014544487,
1598
+ "learning_rate": 8.519346617618134e-05,
1599
+ "loss": 0.049209845066070554,
1600
+ "mean_token_accuracy": 0.9807434439659118,
1601
+ "num_tokens": 32256895.0,
1602
+ "step": 1570
1603
+ },
1604
+ {
1605
+ "entropy": 0.9190246641635895,
1606
+ "epoch": 2.9097605893186005,
1607
+ "grad_norm": 0.8860642910003662,
1608
+ "learning_rate": 8.497649388234304e-05,
1609
+ "loss": 0.051211881637573245,
1610
+ "mean_token_accuracy": 0.9802342295646668,
1611
+ "num_tokens": 32462031.0,
1612
+ "step": 1580
1613
+ },
1614
+ {
1615
+ "entropy": 0.9088015079498291,
1616
+ "epoch": 2.9281767955801103,
1617
+ "grad_norm": 0.8062726855278015,
1618
+ "learning_rate": 8.475822407466188e-05,
1619
+ "loss": 0.053512704372406,
1620
+ "mean_token_accuracy": 0.979486483335495,
1621
+ "num_tokens": 32667533.0,
1622
+ "step": 1590
1623
+ },
1624
+ {
1625
+ "entropy": 0.9462027847766876,
1626
+ "epoch": 2.9465930018416207,
1627
+ "grad_norm": 0.7962909936904907,
1628
+ "learning_rate": 8.453866485023579e-05,
1629
+ "loss": 0.0501457154750824,
1630
+ "mean_token_accuracy": 0.9803222417831421,
1631
+ "num_tokens": 32872900.0,
1632
+ "step": 1600
1633
+ },
1634
+ {
1635
+ "entropy": 0.9671471297740937,
1636
+ "epoch": 2.9650092081031305,
1637
+ "grad_norm": 0.7641744017601013,
1638
+ "learning_rate": 8.431782435399587e-05,
1639
+ "loss": 0.04629061222076416,
1640
+ "mean_token_accuracy": 0.9823175370693207,
1641
+ "num_tokens": 33077850.0,
1642
+ "step": 1610
1643
+ },
1644
+ {
1645
+ "entropy": 0.955865204334259,
1646
+ "epoch": 2.983425414364641,
1647
+ "grad_norm": 0.6772348880767822,
1648
+ "learning_rate": 8.409571077840426e-05,
1649
+ "loss": 0.048368623852729796,
1650
+ "mean_token_accuracy": 0.9808700799942016,
1651
+ "num_tokens": 33283117.0,
1652
+ "step": 1620
1653
+ },
1654
+ {
1655
+ "epoch": 3.0,
1656
+ "eval_entropy": 0.9563225186389426,
1657
+ "eval_loss": 0.059064481407403946,
1658
+ "eval_mean_token_accuracy": 0.9773589429648026,
1659
+ "eval_num_tokens": 33467712.0,
1660
+ "eval_runtime": 10.1471,
1661
+ "eval_samples_per_second": 360.499,
1662
+ "eval_steps_per_second": 11.333,
1663
+ "step": 1629
1664
+ }
1665
+ ],
1666
+ "logging_steps": 10,
1667
+ "max_steps": 5430,
1668
+ "num_input_tokens_seen": 0,
1669
+ "num_train_epochs": 10,
1670
+ "save_steps": 500,
1671
+ "stateful_callbacks": {
1672
+ "TrainerControl": {
1673
+ "args": {
1674
+ "should_epoch_stop": false,
1675
+ "should_evaluate": false,
1676
+ "should_log": false,
1677
+ "should_save": true,
1678
+ "should_training_stop": false
1679
+ },
1680
+ "attributes": {}
1681
+ }
1682
+ },
1683
+ "total_flos": 1.595677368674943e+18,
1684
+ "train_batch_size": 32,
1685
+ "trial_name": null,
1686
+ "trial_params": null
1687
+ }
checkpoint-1629/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75
3
+ size 5777
checkpoint-2172/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-2172/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-2172/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c86f8f8223f27673f137496bfe71dc599b6baf7e185de17ad979b78a2ac98e6
3
+ size 80792096
checkpoint-2172/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2172/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-2172/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-2172/trainer_state.json ADDED
@@ -0,0 +1,2248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2172,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2237394809722901,
14
+ "epoch": 0.01841620626151013,
15
+ "grad_norm": 5.082435607910156,
16
+ "learning_rate": 3.308823529411765e-06,
17
+ "loss": 0.9237876892089844,
18
+ "mean_token_accuracy": 0.7685343027114868,
19
+ "num_tokens": 205423.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2295925617218018,
24
+ "epoch": 0.03683241252302026,
25
+ "grad_norm": 4.672000408172607,
26
+ "learning_rate": 6.985294117647059e-06,
27
+ "loss": 0.8900892257690429,
28
+ "mean_token_accuracy": 0.7677771031856537,
29
+ "num_tokens": 410849.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2285718679428101,
34
+ "epoch": 0.055248618784530384,
35
+ "grad_norm": 1.4828118085861206,
36
+ "learning_rate": 1.0661764705882354e-05,
37
+ "loss": 0.5975452899932862,
38
+ "mean_token_accuracy": 0.8146551787853241,
39
+ "num_tokens": 616438.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.210776400566101,
44
+ "epoch": 0.07366482504604052,
45
+ "grad_norm": 0.7761328816413879,
46
+ "learning_rate": 1.4338235294117647e-05,
47
+ "loss": 0.40664992332458494,
48
+ "mean_token_accuracy": 0.8699092030525207,
49
+ "num_tokens": 822118.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.200321125984192,
54
+ "epoch": 0.09208103130755065,
55
+ "grad_norm": 0.5363371968269348,
56
+ "learning_rate": 1.8014705882352943e-05,
57
+ "loss": 0.3313469409942627,
58
+ "mean_token_accuracy": 0.8904915869235992,
59
+ "num_tokens": 1027941.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.1809936046600342,
64
+ "epoch": 0.11049723756906077,
65
+ "grad_norm": 0.39541518688201904,
66
+ "learning_rate": 2.1691176470588237e-05,
67
+ "loss": 0.27568228244781495,
68
+ "mean_token_accuracy": 0.9047131836414337,
69
+ "num_tokens": 1233620.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.169810914993286,
74
+ "epoch": 0.1289134438305709,
75
+ "grad_norm": 0.341960072517395,
76
+ "learning_rate": 2.536764705882353e-05,
77
+ "loss": 0.245219087600708,
78
+ "mean_token_accuracy": 0.9150686681270599,
79
+ "num_tokens": 1438656.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.1652960777282715,
84
+ "epoch": 0.14732965009208104,
85
+ "grad_norm": 0.36872178316116333,
86
+ "learning_rate": 2.9044117647058828e-05,
87
+ "loss": 0.2220149040222168,
88
+ "mean_token_accuracy": 0.9224777698516846,
89
+ "num_tokens": 1643877.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.154341197013855,
94
+ "epoch": 0.16574585635359115,
95
+ "grad_norm": 0.4152425229549408,
96
+ "learning_rate": 3.272058823529412e-05,
97
+ "loss": 0.2002798557281494,
98
+ "mean_token_accuracy": 0.9285802960395813,
99
+ "num_tokens": 1849506.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.1507258892059327,
104
+ "epoch": 0.1841620626151013,
105
+ "grad_norm": 0.47647765278816223,
106
+ "learning_rate": 3.639705882352941e-05,
107
+ "loss": 0.18871363401412963,
108
+ "mean_token_accuracy": 0.9318056285381318,
109
+ "num_tokens": 2055071.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.1455535531044005,
114
+ "epoch": 0.20257826887661143,
115
+ "grad_norm": 0.4853009581565857,
116
+ "learning_rate": 4.007352941176471e-05,
117
+ "loss": 0.17836341857910157,
118
+ "mean_token_accuracy": 0.9367631554603577,
119
+ "num_tokens": 2260643.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.1402526497840881,
124
+ "epoch": 0.22099447513812154,
125
+ "grad_norm": 0.4455392360687256,
126
+ "learning_rate": 4.375e-05,
127
+ "loss": 0.16921783685684205,
128
+ "mean_token_accuracy": 0.9386959195137023,
129
+ "num_tokens": 2466085.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.1374777555465698,
134
+ "epoch": 0.23941068139963168,
135
+ "grad_norm": 0.5880279541015625,
136
+ "learning_rate": 4.742647058823529e-05,
137
+ "loss": 0.15989291667938232,
138
+ "mean_token_accuracy": 0.9421182632446289,
139
+ "num_tokens": 2671024.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1273940205574036,
144
+ "epoch": 0.2578268876611418,
145
+ "grad_norm": 0.612959086894989,
146
+ "learning_rate": 5.110294117647059e-05,
147
+ "loss": 0.14701461791992188,
148
+ "mean_token_accuracy": 0.9463540315628052,
149
+ "num_tokens": 2876848.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1263513088226318,
154
+ "epoch": 0.27624309392265195,
155
+ "grad_norm": 0.5695255398750305,
156
+ "learning_rate": 5.477941176470589e-05,
157
+ "loss": 0.14604382514953612,
158
+ "mean_token_accuracy": 0.946351945400238,
159
+ "num_tokens": 3082589.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1290789365768432,
164
+ "epoch": 0.2946593001841621,
165
+ "grad_norm": 0.6608090996742249,
166
+ "learning_rate": 5.845588235294118e-05,
167
+ "loss": 0.1409450054168701,
168
+ "mean_token_accuracy": 0.9481450319290161,
169
+ "num_tokens": 3287459.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1291529774665832,
174
+ "epoch": 0.31307550644567217,
175
+ "grad_norm": 0.652715802192688,
176
+ "learning_rate": 6.213235294117647e-05,
177
+ "loss": 0.14441155195236205,
178
+ "mean_token_accuracy": 0.9466125547885895,
179
+ "num_tokens": 3493682.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1244838953018188,
184
+ "epoch": 0.3314917127071823,
185
+ "grad_norm": 0.7815241813659668,
186
+ "learning_rate": 6.580882352941177e-05,
187
+ "loss": 0.13361064195632935,
188
+ "mean_token_accuracy": 0.9512295544147491,
189
+ "num_tokens": 3699573.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1217721104621887,
194
+ "epoch": 0.34990791896869244,
195
+ "grad_norm": 0.7933160066604614,
196
+ "learning_rate": 6.948529411764706e-05,
197
+ "loss": 0.13089522123336791,
198
+ "mean_token_accuracy": 0.9520221531391144,
199
+ "num_tokens": 3905156.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1206679105758668,
204
+ "epoch": 0.3683241252302026,
205
+ "grad_norm": 0.6815240383148193,
206
+ "learning_rate": 7.316176470588236e-05,
207
+ "loss": 0.13400404453277587,
208
+ "mean_token_accuracy": 0.9501322209835052,
209
+ "num_tokens": 4110570.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.1161052227020263,
214
+ "epoch": 0.3867403314917127,
215
+ "grad_norm": 0.8297767639160156,
216
+ "learning_rate": 7.683823529411766e-05,
217
+ "loss": 0.13389937877655028,
218
+ "mean_token_accuracy": 0.9501932203769684,
219
+ "num_tokens": 4315834.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1098745942115784,
224
+ "epoch": 0.40515653775322286,
225
+ "grad_norm": 0.5943381786346436,
226
+ "learning_rate": 8.051470588235294e-05,
227
+ "loss": 0.13452907800674438,
228
+ "mean_token_accuracy": 0.9503286242485046,
229
+ "num_tokens": 4520807.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.100480353832245,
234
+ "epoch": 0.42357274401473294,
235
+ "grad_norm": 0.6094359755516052,
236
+ "learning_rate": 8.419117647058824e-05,
237
+ "loss": 0.12827746868133544,
238
+ "mean_token_accuracy": 0.952492094039917,
239
+ "num_tokens": 4725867.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.0901286959648133,
244
+ "epoch": 0.4419889502762431,
245
+ "grad_norm": 0.7240597605705261,
246
+ "learning_rate": 8.786764705882353e-05,
247
+ "loss": 0.12171242237091065,
248
+ "mean_token_accuracy": 0.953943532705307,
249
+ "num_tokens": 4931629.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.0885071873664856,
254
+ "epoch": 0.4604051565377532,
255
+ "grad_norm": 0.6939547657966614,
256
+ "learning_rate": 9.154411764705882e-05,
257
+ "loss": 0.12155698537826538,
258
+ "mean_token_accuracy": 0.9545870959758759,
259
+ "num_tokens": 5137285.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.086272156238556,
264
+ "epoch": 0.47882136279926335,
265
+ "grad_norm": 0.5752800703048706,
266
+ "learning_rate": 9.522058823529412e-05,
267
+ "loss": 0.12157790660858155,
268
+ "mean_token_accuracy": 0.9541126549243927,
269
+ "num_tokens": 5342575.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.0857678413391114,
274
+ "epoch": 0.4972375690607735,
275
+ "grad_norm": 0.7565123438835144,
276
+ "learning_rate": 9.889705882352942e-05,
277
+ "loss": 0.12349612712860107,
278
+ "mean_token_accuracy": 0.9535140514373779,
279
+ "num_tokens": 5547995.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.079762625694275,
284
+ "epoch": 0.5156537753222836,
285
+ "grad_norm": 0.6972768306732178,
286
+ "learning_rate": 9.999954556423843e-05,
287
+ "loss": 0.11875582933425903,
288
+ "mean_token_accuracy": 0.9556483089923858,
289
+ "num_tokens": 5753195.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.0742079138755798,
294
+ "epoch": 0.5340699815837937,
295
+ "grad_norm": 0.7821696996688843,
296
+ "learning_rate": 9.999731977631227e-05,
297
+ "loss": 0.11824090480804443,
298
+ "mean_token_accuracy": 0.9557521045207977,
299
+ "num_tokens": 5958236.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.0679773569107056,
304
+ "epoch": 0.5524861878453039,
305
+ "grad_norm": 0.5846888422966003,
306
+ "learning_rate": 9.999323925089486e-05,
307
+ "loss": 0.11707355976104736,
308
+ "mean_token_accuracy": 0.9554719448089599,
309
+ "num_tokens": 6163992.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.0655727863311768,
314
+ "epoch": 0.570902394106814,
315
+ "grad_norm": 0.5812502503395081,
316
+ "learning_rate": 9.998730413936037e-05,
317
+ "loss": 0.11371417045593261,
318
+ "mean_token_accuracy": 0.9576376020908356,
319
+ "num_tokens": 6369456.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.0607039332389832,
324
+ "epoch": 0.5893186003683242,
325
+ "grad_norm": 0.6238475441932678,
326
+ "learning_rate": 9.99795146618821e-05,
327
+ "loss": 0.11775733232498169,
328
+ "mean_token_accuracy": 0.9557221591472626,
329
+ "num_tokens": 6574833.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.0504255175590516,
334
+ "epoch": 0.6077348066298343,
335
+ "grad_norm": 0.6496815085411072,
336
+ "learning_rate": 9.996987110742422e-05,
337
+ "loss": 0.10904088020324706,
338
+ "mean_token_accuracy": 0.9585366368293762,
339
+ "num_tokens": 6780108.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.0456081986427308,
344
+ "epoch": 0.6261510128913443,
345
+ "grad_norm": 0.786702573299408,
346
+ "learning_rate": 9.995837383373119e-05,
347
+ "loss": 0.10642309188842773,
348
+ "mean_token_accuracy": 0.9596696078777314,
349
+ "num_tokens": 6985920.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.0455098271369934,
354
+ "epoch": 0.6445672191528545,
355
+ "grad_norm": 0.5473790168762207,
356
+ "learning_rate": 9.994502326731434e-05,
357
+ "loss": 0.10822961330413819,
358
+ "mean_token_accuracy": 0.959563136100769,
359
+ "num_tokens": 7191465.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.04240562915802,
364
+ "epoch": 0.6629834254143646,
365
+ "grad_norm": 0.6672356128692627,
366
+ "learning_rate": 9.992981990343614e-05,
367
+ "loss": 0.1110004186630249,
368
+ "mean_token_accuracy": 0.9582514643669129,
369
+ "num_tokens": 7396877.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0386811256408692,
374
+ "epoch": 0.6813996316758748,
375
+ "grad_norm": 0.698539674282074,
376
+ "learning_rate": 9.99127643060918e-05,
377
+ "loss": 0.107539963722229,
378
+ "mean_token_accuracy": 0.9593036234378814,
379
+ "num_tokens": 7602437.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.0311225533485413,
384
+ "epoch": 0.6998158379373849,
385
+ "grad_norm": 0.6629284024238586,
386
+ "learning_rate": 9.989385710798837e-05,
387
+ "loss": 0.1064023494720459,
388
+ "mean_token_accuracy": 0.9602205216884613,
389
+ "num_tokens": 7808142.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.030210506916046,
394
+ "epoch": 0.7182320441988951,
395
+ "grad_norm": 0.5616748929023743,
396
+ "learning_rate": 9.987309901052121e-05,
397
+ "loss": 0.10717041492462158,
398
+ "mean_token_accuracy": 0.9599347949028015,
399
+ "num_tokens": 8013407.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0208017826080322,
404
+ "epoch": 0.7366482504604052,
405
+ "grad_norm": 0.6329049468040466,
406
+ "learning_rate": 9.985049078374806e-05,
407
+ "loss": 0.10359601974487305,
408
+ "mean_token_accuracy": 0.9603756129741668,
409
+ "num_tokens": 8219040.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.015640377998352,
414
+ "epoch": 0.7550644567219152,
415
+ "grad_norm": 0.6516013741493225,
416
+ "learning_rate": 9.982603326636037e-05,
417
+ "loss": 0.10146439075469971,
418
+ "mean_token_accuracy": 0.9627702474594116,
419
+ "num_tokens": 8424678.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0105359435081482,
424
+ "epoch": 0.7734806629834254,
425
+ "grad_norm": 0.6920603513717651,
426
+ "learning_rate": 9.979972736565226e-05,
427
+ "loss": 0.10770498514175415,
428
+ "mean_token_accuracy": 0.9591470420360565,
429
+ "num_tokens": 8629868.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.9966452836990356,
434
+ "epoch": 0.7918968692449355,
435
+ "grad_norm": 0.6857476234436035,
436
+ "learning_rate": 9.977157405748687e-05,
437
+ "loss": 0.10282524824142455,
438
+ "mean_token_accuracy": 0.9612209022045135,
439
+ "num_tokens": 8835320.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.9945534646511078,
444
+ "epoch": 0.8103130755064457,
445
+ "grad_norm": 0.7208472490310669,
446
+ "learning_rate": 9.974157438626008e-05,
447
+ "loss": 0.10069938898086547,
448
+ "mean_token_accuracy": 0.9620070576667785,
449
+ "num_tokens": 9041123.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.979461395740509,
454
+ "epoch": 0.8287292817679558,
455
+ "grad_norm": 0.5071915984153748,
456
+ "learning_rate": 9.970972946486185e-05,
457
+ "loss": 0.09799174070358277,
458
+ "mean_token_accuracy": 0.9620374023914338,
459
+ "num_tokens": 9246361.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.9830998003482818,
464
+ "epoch": 0.8471454880294659,
465
+ "grad_norm": 0.8660802245140076,
466
+ "learning_rate": 9.967604047463493e-05,
467
+ "loss": 0.10378165245056152,
468
+ "mean_token_accuracy": 0.9606865763664245,
469
+ "num_tokens": 9451845.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.9813413023948669,
474
+ "epoch": 0.8655616942909761,
475
+ "grad_norm": 0.7642477750778198,
476
+ "learning_rate": 9.964050866533094e-05,
477
+ "loss": 0.1010061264038086,
478
+ "mean_token_accuracy": 0.9608745336532593,
479
+ "num_tokens": 9656802.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.967874163389206,
484
+ "epoch": 0.8839779005524862,
485
+ "grad_norm": 0.5987281799316406,
486
+ "learning_rate": 9.960313535506411e-05,
487
+ "loss": 0.10169394016265869,
488
+ "mean_token_accuracy": 0.9611998200416565,
489
+ "num_tokens": 9861719.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.9663491308689117,
494
+ "epoch": 0.9023941068139963,
495
+ "grad_norm": 0.6124638319015503,
496
+ "learning_rate": 9.956392193026239e-05,
497
+ "loss": 0.102389657497406,
498
+ "mean_token_accuracy": 0.9611884355545044,
499
+ "num_tokens": 10066673.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.959654438495636,
504
+ "epoch": 0.9208103130755064,
505
+ "grad_norm": 0.7873051762580872,
506
+ "learning_rate": 9.952286984561592e-05,
507
+ "loss": 0.10170392990112305,
508
+ "mean_token_accuracy": 0.9610928475856781,
509
+ "num_tokens": 10272091.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.9550537407398224,
514
+ "epoch": 0.9392265193370166,
515
+ "grad_norm": 0.6071968078613281,
516
+ "learning_rate": 9.947998062402313e-05,
517
+ "loss": 0.09448277950286865,
518
+ "mean_token_accuracy": 0.9648977637290954,
519
+ "num_tokens": 10477632.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.9538533687591553,
524
+ "epoch": 0.9576427255985267,
525
+ "grad_norm": 0.6317242980003357,
526
+ "learning_rate": 9.943525585653428e-05,
527
+ "loss": 0.09542192220687866,
528
+ "mean_token_accuracy": 0.9635261118412017,
529
+ "num_tokens": 10682828.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.9362513542175293,
534
+ "epoch": 0.9760589318600368,
535
+ "grad_norm": 0.6421944499015808,
536
+ "learning_rate": 9.938869720229234e-05,
537
+ "loss": 0.09382058382034301,
538
+ "mean_token_accuracy": 0.9648073971271515,
539
+ "num_tokens": 10888741.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.9235438346862793,
544
+ "epoch": 0.994475138121547,
545
+ "grad_norm": 0.7986873388290405,
546
+ "learning_rate": 9.934030638847155e-05,
547
+ "loss": 0.09827429056167603,
548
+ "mean_token_accuracy": 0.9621128737926483,
549
+ "num_tokens": 11094387.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "epoch": 1.0,
554
+ "eval_entropy": 0.9137652366057686,
555
+ "eval_loss": 0.09368764609098434,
556
+ "eval_mean_token_accuracy": 0.9640816880309063,
557
+ "eval_num_tokens": 11155908.0,
558
+ "eval_runtime": 10.4701,
559
+ "eval_samples_per_second": 349.377,
560
+ "eval_steps_per_second": 10.984,
561
+ "step": 543
562
+ },
563
+ {
564
+ "entropy": 0.9047818422317505,
565
+ "epoch": 1.0128913443830572,
566
+ "grad_norm": 0.6781501173973083,
567
+ "learning_rate": 9.929008521021325e-05,
568
+ "loss": 0.0863916516304016,
569
+ "mean_token_accuracy": 0.9673655688762665,
570
+ "num_tokens": 11299715.0,
571
+ "step": 550
572
+ },
573
+ {
574
+ "entropy": 0.8856981039047241,
575
+ "epoch": 1.0313075506445673,
576
+ "grad_norm": 0.7143136858940125,
577
+ "learning_rate": 9.923803553055937e-05,
578
+ "loss": 0.08632323145866394,
579
+ "mean_token_accuracy": 0.9677783191204071,
580
+ "num_tokens": 11505059.0,
581
+ "step": 560
582
+ },
583
+ {
584
+ "entropy": 0.8937099635601043,
585
+ "epoch": 1.0497237569060773,
586
+ "grad_norm": 0.7751694321632385,
587
+ "learning_rate": 9.918415928038325e-05,
588
+ "loss": 0.08178263902664185,
589
+ "mean_token_accuracy": 0.9694291114807129,
590
+ "num_tokens": 11710464.0,
591
+ "step": 570
592
+ },
593
+ {
594
+ "entropy": 0.8858704209327698,
595
+ "epoch": 1.0681399631675874,
596
+ "grad_norm": 0.7492292523384094,
597
+ "learning_rate": 9.912845845831805e-05,
598
+ "loss": 0.08074211478233337,
599
+ "mean_token_accuracy": 0.9692470014095307,
600
+ "num_tokens": 11915959.0,
601
+ "step": 580
602
+ },
603
+ {
604
+ "entropy": 0.8948039829730987,
605
+ "epoch": 1.0865561694290977,
606
+ "grad_norm": 0.8116479516029358,
607
+ "learning_rate": 9.907093513068259e-05,
608
+ "loss": 0.08712012171745301,
609
+ "mean_token_accuracy": 0.9669980227947235,
610
+ "num_tokens": 12121499.0,
611
+ "step": 590
612
+ },
613
+ {
614
+ "entropy": 0.8846789538860321,
615
+ "epoch": 1.1049723756906078,
616
+ "grad_norm": 0.7295626997947693,
617
+ "learning_rate": 9.901159143140471e-05,
618
+ "loss": 0.08444435596466064,
619
+ "mean_token_accuracy": 0.9674544095993042,
620
+ "num_tokens": 12327061.0,
621
+ "step": 600
622
+ },
623
+ {
624
+ "entropy": 0.8734103918075562,
625
+ "epoch": 1.1233885819521179,
626
+ "grad_norm": 0.9585768580436707,
627
+ "learning_rate": 9.89504295619421e-05,
628
+ "loss": 0.08022565841674804,
629
+ "mean_token_accuracy": 0.969569206237793,
630
+ "num_tokens": 12532305.0,
631
+ "step": 610
632
+ },
633
+ {
634
+ "entropy": 0.8640486001968384,
635
+ "epoch": 1.141804788213628,
636
+ "grad_norm": 0.7891159057617188,
637
+ "learning_rate": 9.88874517912006e-05,
638
+ "loss": 0.08415375947952271,
639
+ "mean_token_accuracy": 0.9678892493247986,
640
+ "num_tokens": 12737828.0,
641
+ "step": 620
642
+ },
643
+ {
644
+ "entropy": 0.8599755525588989,
645
+ "epoch": 1.160220994475138,
646
+ "grad_norm": 0.5801345109939575,
647
+ "learning_rate": 9.882266045545012e-05,
648
+ "loss": 0.08100489974021911,
649
+ "mean_token_accuracy": 0.9688023269176483,
650
+ "num_tokens": 12943343.0,
651
+ "step": 630
652
+ },
653
+ {
654
+ "entropy": 0.86524977684021,
655
+ "epoch": 1.1786372007366483,
656
+ "grad_norm": 0.7633041143417358,
657
+ "learning_rate": 9.87560579582379e-05,
658
+ "loss": 0.07859406471252442,
659
+ "mean_token_accuracy": 0.9702189445495606,
660
+ "num_tokens": 13148473.0,
661
+ "step": 640
662
+ },
663
+ {
664
+ "entropy": 0.8466695249080658,
665
+ "epoch": 1.1970534069981584,
666
+ "grad_norm": 0.8672215938568115,
667
+ "learning_rate": 9.868764677029934e-05,
668
+ "loss": 0.08082623481750488,
669
+ "mean_token_accuracy": 0.9689972400665283,
670
+ "num_tokens": 13353890.0,
671
+ "step": 650
672
+ },
673
+ {
674
+ "entropy": 0.8596941530704498,
675
+ "epoch": 1.2154696132596685,
676
+ "grad_norm": 0.7524124383926392,
677
+ "learning_rate": 9.861742942946639e-05,
678
+ "loss": 0.0789935290813446,
679
+ "mean_token_accuracy": 0.9693858206272126,
680
+ "num_tokens": 13559475.0,
681
+ "step": 660
682
+ },
683
+ {
684
+ "entropy": 0.8708749234676361,
685
+ "epoch": 1.2338858195211786,
686
+ "grad_norm": 0.5777031183242798,
687
+ "learning_rate": 9.854540854057337e-05,
688
+ "loss": 0.07773642539978028,
689
+ "mean_token_accuracy": 0.970385092496872,
690
+ "num_tokens": 13765076.0,
691
+ "step": 670
692
+ },
693
+ {
694
+ "entropy": 0.8651713371276856,
695
+ "epoch": 1.2523020257826887,
696
+ "grad_norm": 0.7924166321754456,
697
+ "learning_rate": 9.847158677536034e-05,
698
+ "loss": 0.0766686737537384,
699
+ "mean_token_accuracy": 0.9702267110347748,
700
+ "num_tokens": 13970642.0,
701
+ "step": 680
702
+ },
703
+ {
704
+ "entropy": 0.8763024985790253,
705
+ "epoch": 1.270718232044199,
706
+ "grad_norm": 0.741219162940979,
707
+ "learning_rate": 9.839596687237403e-05,
708
+ "loss": 0.07189929485321045,
709
+ "mean_token_accuracy": 0.9727097094058991,
710
+ "num_tokens": 14176556.0,
711
+ "step": 690
712
+ },
713
+ {
714
+ "entropy": 0.8556921362876893,
715
+ "epoch": 1.289134438305709,
716
+ "grad_norm": 0.6298198103904724,
717
+ "learning_rate": 9.831855163686618e-05,
718
+ "loss": 0.07608137726783752,
719
+ "mean_token_accuracy": 0.9716399371623993,
720
+ "num_tokens": 14381686.0,
721
+ "step": 700
722
+ },
723
+ {
724
+ "entropy": 0.869178420305252,
725
+ "epoch": 1.3075506445672191,
726
+ "grad_norm": 0.5850273370742798,
727
+ "learning_rate": 9.823934394068952e-05,
728
+ "loss": 0.07437651753425598,
729
+ "mean_token_accuracy": 0.9709566533565521,
730
+ "num_tokens": 14586814.0,
731
+ "step": 710
732
+ },
733
+ {
734
+ "entropy": 0.8708595156669616,
735
+ "epoch": 1.3259668508287292,
736
+ "grad_norm": 0.6580632328987122,
737
+ "learning_rate": 9.815834672219127e-05,
738
+ "loss": 0.07518917322158813,
739
+ "mean_token_accuracy": 0.9717426657676697,
740
+ "num_tokens": 14792321.0,
741
+ "step": 720
742
+ },
743
+ {
744
+ "entropy": 0.8826817810535431,
745
+ "epoch": 1.3443830570902393,
746
+ "grad_norm": 0.8788532018661499,
747
+ "learning_rate": 9.807556298610404e-05,
748
+ "loss": 0.07579240798950196,
749
+ "mean_token_accuracy": 0.9706341981887817,
750
+ "num_tokens": 14997810.0,
751
+ "step": 730
752
+ },
753
+ {
754
+ "entropy": 0.9012470185756684,
755
+ "epoch": 1.3627992633517496,
756
+ "grad_norm": 0.7022138237953186,
757
+ "learning_rate": 9.799099580343441e-05,
758
+ "loss": 0.0775588572025299,
759
+ "mean_token_accuracy": 0.9699241399765015,
760
+ "num_tokens": 15203795.0,
761
+ "step": 740
762
+ },
763
+ {
764
+ "entropy": 0.886955714225769,
765
+ "epoch": 1.3812154696132597,
766
+ "grad_norm": 0.7881133556365967,
767
+ "learning_rate": 9.790464831134903e-05,
768
+ "loss": 0.07125020027160645,
769
+ "mean_token_accuracy": 0.9723815560340882,
770
+ "num_tokens": 15408974.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9047374844551086,
775
+ "epoch": 1.3996316758747698,
776
+ "grad_norm": 0.9082005023956299,
777
+ "learning_rate": 9.781652371305824e-05,
778
+ "loss": 0.07004334926605224,
779
+ "mean_token_accuracy": 0.9725580036640167,
780
+ "num_tokens": 15614399.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9039053857326508,
785
+ "epoch": 1.4180478821362799,
786
+ "grad_norm": 0.8060817122459412,
787
+ "learning_rate": 9.77266252776972e-05,
788
+ "loss": 0.07103485465049744,
789
+ "mean_token_accuracy": 0.9721468150615692,
790
+ "num_tokens": 15819895.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.8998047232627868,
795
+ "epoch": 1.43646408839779,
796
+ "grad_norm": 1.0152642726898193,
797
+ "learning_rate": 9.763495634020467e-05,
798
+ "loss": 0.07411704063415528,
799
+ "mean_token_accuracy": 0.9711063146591187,
800
+ "num_tokens": 16025297.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.9120213568210602,
805
+ "epoch": 1.4548802946593002,
806
+ "grad_norm": 0.6288319826126099,
807
+ "learning_rate": 9.754152030119921e-05,
808
+ "loss": 0.07223712205886841,
809
+ "mean_token_accuracy": 0.9722476422786712,
810
+ "num_tokens": 16230656.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.9142370820045471,
815
+ "epoch": 1.4732965009208103,
816
+ "grad_norm": 0.7854700088500977,
817
+ "learning_rate": 9.744632062685311e-05,
818
+ "loss": 0.07186744809150696,
819
+ "mean_token_accuracy": 0.972247713804245,
820
+ "num_tokens": 16435943.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.8920814216136932,
825
+ "epoch": 1.4917127071823204,
826
+ "grad_norm": 0.6227074265480042,
827
+ "learning_rate": 9.734936084876383e-05,
828
+ "loss": 0.07016961574554444,
829
+ "mean_token_accuracy": 0.9725603640079499,
830
+ "num_tokens": 16641635.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.891328877210617,
835
+ "epoch": 1.5101289134438307,
836
+ "grad_norm": 0.7601346969604492,
837
+ "learning_rate": 9.725064456382283e-05,
838
+ "loss": 0.07137494087219239,
839
+ "mean_token_accuracy": 0.9722997546195984,
840
+ "num_tokens": 16847194.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.8921217978000641,
845
+ "epoch": 1.5285451197053406,
846
+ "grad_norm": 0.7813850045204163,
847
+ "learning_rate": 9.715017543408233e-05,
848
+ "loss": 0.06890199184417725,
849
+ "mean_token_accuracy": 0.9735044002532959,
850
+ "num_tokens": 17052807.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.9085914671421051,
855
+ "epoch": 1.5469613259668509,
856
+ "grad_norm": 0.6184289455413818,
857
+ "learning_rate": 9.704795718661939e-05,
858
+ "loss": 0.07043765187263488,
859
+ "mean_token_accuracy": 0.9725716531276702,
860
+ "num_tokens": 17258284.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9029861629009247,
865
+ "epoch": 1.565377532228361,
866
+ "grad_norm": 0.7082377076148987,
867
+ "learning_rate": 9.694399361339752e-05,
868
+ "loss": 0.07113839387893676,
869
+ "mean_token_accuracy": 0.9725669205188752,
870
+ "num_tokens": 17464326.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.8856533527374267,
875
+ "epoch": 1.583793738489871,
876
+ "grad_norm": 0.7409216165542603,
877
+ "learning_rate": 9.683828857112627e-05,
878
+ "loss": 0.07077333331108093,
879
+ "mean_token_accuracy": 0.9731084644794464,
880
+ "num_tokens": 17669537.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.8613030433654785,
885
+ "epoch": 1.6022099447513813,
886
+ "grad_norm": 0.6801561713218689,
887
+ "learning_rate": 9.673084598111789e-05,
888
+ "loss": 0.06885308027267456,
889
+ "mean_token_accuracy": 0.97266526222229,
890
+ "num_tokens": 17875289.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.8692965865135193,
895
+ "epoch": 1.6206261510128912,
896
+ "grad_norm": 1.1621277332305908,
897
+ "learning_rate": 9.662166982914203e-05,
898
+ "loss": 0.07017780542373657,
899
+ "mean_token_accuracy": 0.9733059942722321,
900
+ "num_tokens": 18080404.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.8671502113342285,
905
+ "epoch": 1.6390423572744015,
906
+ "grad_norm": 0.7518903613090515,
907
+ "learning_rate": 9.651076416527787e-05,
908
+ "loss": 0.06977018713951111,
909
+ "mean_token_accuracy": 0.9730017304420471,
910
+ "num_tokens": 18285699.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.8662045657634735,
915
+ "epoch": 1.6574585635359116,
916
+ "grad_norm": 0.6622698903083801,
917
+ "learning_rate": 9.639813310376378e-05,
918
+ "loss": 0.06620995998382569,
919
+ "mean_token_accuracy": 0.9737491130828857,
920
+ "num_tokens": 18491097.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.8548173069953918,
925
+ "epoch": 1.6758747697974217,
926
+ "grad_norm": 0.8941843509674072,
927
+ "learning_rate": 9.628378082284479e-05,
928
+ "loss": 0.06711119413375854,
929
+ "mean_token_accuracy": 0.9740589797496796,
930
+ "num_tokens": 18696827.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.8763562262058258,
935
+ "epoch": 1.694290976058932,
936
+ "grad_norm": 0.7571700215339661,
937
+ "learning_rate": 9.616771156461755e-05,
938
+ "loss": 0.07263468503952027,
939
+ "mean_token_accuracy": 0.9717419981956482,
940
+ "num_tokens": 18902513.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.8663733780384064,
945
+ "epoch": 1.7127071823204418,
946
+ "grad_norm": 0.7886489629745483,
947
+ "learning_rate": 9.604992963487298e-05,
948
+ "loss": 0.07074605226516724,
949
+ "mean_token_accuracy": 0.9724965393543243,
950
+ "num_tokens": 19107812.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.8673004627227783,
955
+ "epoch": 1.7311233885819521,
956
+ "grad_norm": 0.8180726170539856,
957
+ "learning_rate": 9.593043940293647e-05,
958
+ "loss": 0.06831735372543335,
959
+ "mean_token_accuracy": 0.9733696818351746,
960
+ "num_tokens": 19313330.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.8525971233844757,
965
+ "epoch": 1.7495395948434622,
966
+ "grad_norm": 0.6576228737831116,
967
+ "learning_rate": 9.580924530150595e-05,
968
+ "loss": 0.06567002534866333,
969
+ "mean_token_accuracy": 0.9745754361152649,
970
+ "num_tokens": 19518671.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.8605451703071594,
975
+ "epoch": 1.7679558011049723,
976
+ "grad_norm": 0.7171661257743835,
977
+ "learning_rate": 9.568635182648725e-05,
978
+ "loss": 0.06872050762176514,
979
+ "mean_token_accuracy": 0.9732091546058654,
980
+ "num_tokens": 19724135.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.8642210960388184,
985
+ "epoch": 1.7863720073664826,
986
+ "grad_norm": 0.7603147029876709,
987
+ "learning_rate": 9.556176353682746e-05,
988
+ "loss": 0.06766576766967773,
989
+ "mean_token_accuracy": 0.9728681743144989,
990
+ "num_tokens": 19928785.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.8543185651302337,
995
+ "epoch": 1.8047882136279927,
996
+ "grad_norm": 0.7280875444412231,
997
+ "learning_rate": 9.543548505434581e-05,
998
+ "loss": 0.06851862668991089,
999
+ "mean_token_accuracy": 0.9737437188625335,
1000
+ "num_tokens": 20134195.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.8744745373725891,
1005
+ "epoch": 1.8232044198895028,
1006
+ "grad_norm": 0.5897248983383179,
1007
+ "learning_rate": 9.530752106356209e-05,
1008
+ "loss": 0.06809053421020508,
1009
+ "mean_token_accuracy": 0.9733593761920929,
1010
+ "num_tokens": 20339517.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.8623859465122223,
1015
+ "epoch": 1.8416206261510129,
1016
+ "grad_norm": 0.7515265345573425,
1017
+ "learning_rate": 9.517787631152298e-05,
1018
+ "loss": 0.07257847785949707,
1019
+ "mean_token_accuracy": 0.9714054942131043,
1020
+ "num_tokens": 20545249.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.8669404804706573,
1025
+ "epoch": 1.860036832412523,
1026
+ "grad_norm": 0.7144560813903809,
1027
+ "learning_rate": 9.504655560762596e-05,
1028
+ "loss": 0.06832354068756104,
1029
+ "mean_token_accuracy": 0.9735779523849487,
1030
+ "num_tokens": 20750507.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.8493516445159912,
1035
+ "epoch": 1.8784530386740332,
1036
+ "grad_norm": 0.6559189558029175,
1037
+ "learning_rate": 9.491356382344081e-05,
1038
+ "loss": 0.0629766047000885,
1039
+ "mean_token_accuracy": 0.9754977762699127,
1040
+ "num_tokens": 20955956.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.8599376022815705,
1045
+ "epoch": 1.8968692449355433,
1046
+ "grad_norm": 0.6792973279953003,
1047
+ "learning_rate": 9.477890589252895e-05,
1048
+ "loss": 0.0666757881641388,
1049
+ "mean_token_accuracy": 0.974083811044693,
1050
+ "num_tokens": 21161163.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.8458438158035279,
1055
+ "epoch": 1.9152854511970534,
1056
+ "grad_norm": 0.6941778659820557,
1057
+ "learning_rate": 9.464258681026042e-05,
1058
+ "loss": 0.06307152509689332,
1059
+ "mean_token_accuracy": 0.9757042229175568,
1060
+ "num_tokens": 21366525.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.848515909910202,
1065
+ "epoch": 1.9337016574585635,
1066
+ "grad_norm": 0.7307806611061096,
1067
+ "learning_rate": 9.450461163362855e-05,
1068
+ "loss": 0.06307026147842407,
1069
+ "mean_token_accuracy": 0.9750974595546722,
1070
+ "num_tokens": 21572238.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.8563454031944275,
1075
+ "epoch": 1.9521178637200736,
1076
+ "grad_norm": 0.7222106456756592,
1077
+ "learning_rate": 9.436498548106236e-05,
1078
+ "loss": 0.0647726058959961,
1079
+ "mean_token_accuracy": 0.974629694223404,
1080
+ "num_tokens": 21777633.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.8656457483768463,
1085
+ "epoch": 1.9705340699815839,
1086
+ "grad_norm": 0.67178875207901,
1087
+ "learning_rate": 9.422371353223674e-05,
1088
+ "loss": 0.06573554277420043,
1089
+ "mean_token_accuracy": 0.9745908617973328,
1090
+ "num_tokens": 21983116.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.8630891263484954,
1095
+ "epoch": 1.988950276243094,
1096
+ "grad_norm": 0.6956593990325928,
1097
+ "learning_rate": 9.408080102788016e-05,
1098
+ "loss": 0.06630704402923585,
1099
+ "mean_token_accuracy": 0.9741333484649658,
1100
+ "num_tokens": 22188662.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "epoch": 2.0,
1105
+ "eval_entropy": 0.8560857042022373,
1106
+ "eval_loss": 0.06494329869747162,
1107
+ "eval_mean_token_accuracy": 0.9745692672936813,
1108
+ "eval_num_tokens": 22311800.0,
1109
+ "eval_runtime": 10.129,
1110
+ "eval_samples_per_second": 361.142,
1111
+ "eval_steps_per_second": 11.354,
1112
+ "step": 1086
1113
+ },
1114
+ {
1115
+ "entropy": 0.8616272270679474,
1116
+ "epoch": 2.007366482504604,
1117
+ "grad_norm": 0.7778105139732361,
1118
+ "learning_rate": 9.393625326958041e-05,
1119
+ "loss": 0.054407155513763426,
1120
+ "mean_token_accuracy": 0.9792074799537659,
1121
+ "num_tokens": 22394215.0,
1122
+ "step": 1090
1123
+ },
1124
+ {
1125
+ "entropy": 0.8496910452842712,
1126
+ "epoch": 2.0257826887661143,
1127
+ "grad_norm": 0.7422528266906738,
1128
+ "learning_rate": 9.379007561958792e-05,
1129
+ "loss": 0.051881587505340575,
1130
+ "mean_token_accuracy": 0.9799090325832367,
1131
+ "num_tokens": 22599599.0,
1132
+ "step": 1100
1133
+ },
1134
+ {
1135
+ "entropy": 0.8531602442264556,
1136
+ "epoch": 2.044198895027624,
1137
+ "grad_norm": 0.9075332880020142,
1138
+ "learning_rate": 9.36422735006167e-05,
1139
+ "loss": 0.05190724730491638,
1140
+ "mean_token_accuracy": 0.979931116104126,
1141
+ "num_tokens": 22805318.0,
1142
+ "step": 1110
1143
+ },
1144
+ {
1145
+ "entropy": 0.8657277703285218,
1146
+ "epoch": 2.0626151012891345,
1147
+ "grad_norm": 0.9466913938522339,
1148
+ "learning_rate": 9.349285239564325e-05,
1149
+ "loss": 0.053853434324264524,
1150
+ "mean_token_accuracy": 0.9796103596687317,
1151
+ "num_tokens": 23010438.0,
1152
+ "step": 1120
1153
+ },
1154
+ {
1155
+ "entropy": 0.8578485429286957,
1156
+ "epoch": 2.0810313075506444,
1157
+ "grad_norm": 0.6903054714202881,
1158
+ "learning_rate": 9.334181784770326e-05,
1159
+ "loss": 0.05228850841522217,
1160
+ "mean_token_accuracy": 0.9802409887313843,
1161
+ "num_tokens": 23215795.0,
1162
+ "step": 1130
1163
+ },
1164
+ {
1165
+ "entropy": 0.8450767934322357,
1166
+ "epoch": 2.0994475138121547,
1167
+ "grad_norm": 0.6615211367607117,
1168
+ "learning_rate": 9.318917545968581e-05,
1169
+ "loss": 0.050570905208587646,
1170
+ "mean_token_accuracy": 0.9802053451538086,
1171
+ "num_tokens": 23421157.0,
1172
+ "step": 1140
1173
+ },
1174
+ {
1175
+ "entropy": 0.8325044393539429,
1176
+ "epoch": 2.117863720073665,
1177
+ "grad_norm": 0.760960578918457,
1178
+ "learning_rate": 9.303493089412564e-05,
1179
+ "loss": 0.051966112852096555,
1180
+ "mean_token_accuracy": 0.9796205997467041,
1181
+ "num_tokens": 23626584.0,
1182
+ "step": 1150
1183
+ },
1184
+ {
1185
+ "entropy": 0.8416404843330383,
1186
+ "epoch": 2.136279926335175,
1187
+ "grad_norm": 0.6947009563446045,
1188
+ "learning_rate": 9.287908987299306e-05,
1189
+ "loss": 0.05144861936569214,
1190
+ "mean_token_accuracy": 0.9800034642219544,
1191
+ "num_tokens": 23832137.0,
1192
+ "step": 1160
1193
+ },
1194
+ {
1195
+ "entropy": 0.8564540028572083,
1196
+ "epoch": 2.154696132596685,
1197
+ "grad_norm": 0.733252763748169,
1198
+ "learning_rate": 9.272165817748164e-05,
1199
+ "loss": 0.04944799542427063,
1200
+ "mean_token_accuracy": 0.9808157980442047,
1201
+ "num_tokens": 24038006.0,
1202
+ "step": 1170
1203
+ },
1204
+ {
1205
+ "entropy": 0.8575525343418121,
1206
+ "epoch": 2.1731123388581954,
1207
+ "grad_norm": 0.8911028504371643,
1208
+ "learning_rate": 9.25626416477938e-05,
1209
+ "loss": 0.05037952661514282,
1210
+ "mean_token_accuracy": 0.980946284532547,
1211
+ "num_tokens": 24243374.0,
1212
+ "step": 1180
1213
+ },
1214
+ {
1215
+ "entropy": 0.8599720418453216,
1216
+ "epoch": 2.1915285451197053,
1217
+ "grad_norm": 0.7713524103164673,
1218
+ "learning_rate": 9.240204618292416e-05,
1219
+ "loss": 0.050603735446929934,
1220
+ "mean_token_accuracy": 0.980896121263504,
1221
+ "num_tokens": 24448585.0,
1222
+ "step": 1190
1223
+ },
1224
+ {
1225
+ "entropy": 0.8566664934158326,
1226
+ "epoch": 2.2099447513812156,
1227
+ "grad_norm": 0.8439353704452515,
1228
+ "learning_rate": 9.223987774044066e-05,
1229
+ "loss": 0.054171699285507205,
1230
+ "mean_token_accuracy": 0.9796543836593627,
1231
+ "num_tokens": 24653863.0,
1232
+ "step": 1200
1233
+ },
1234
+ {
1235
+ "entropy": 0.846601277589798,
1236
+ "epoch": 2.2283609576427255,
1237
+ "grad_norm": 0.7025637030601501,
1238
+ "learning_rate": 9.207614233626356e-05,
1239
+ "loss": 0.048924127221107484,
1240
+ "mean_token_accuracy": 0.9809681415557862,
1241
+ "num_tokens": 24859801.0,
1242
+ "step": 1210
1243
+ },
1244
+ {
1245
+ "entropy": 0.8564423739910125,
1246
+ "epoch": 2.2467771639042358,
1247
+ "grad_norm": 0.7788274884223938,
1248
+ "learning_rate": 9.191084604444233e-05,
1249
+ "loss": 0.05260283350944519,
1250
+ "mean_token_accuracy": 0.9793797850608825,
1251
+ "num_tokens": 25065368.0,
1252
+ "step": 1220
1253
+ },
1254
+ {
1255
+ "entropy": 0.865056723356247,
1256
+ "epoch": 2.265193370165746,
1257
+ "grad_norm": 0.8728818297386169,
1258
+ "learning_rate": 9.174399499693027e-05,
1259
+ "loss": 0.05016371011734009,
1260
+ "mean_token_accuracy": 0.9807134211063385,
1261
+ "num_tokens": 25270945.0,
1262
+ "step": 1230
1263
+ },
1264
+ {
1265
+ "entropy": 0.8642262935638427,
1266
+ "epoch": 2.283609576427256,
1267
+ "grad_norm": 1.0582489967346191,
1268
+ "learning_rate": 9.157559538335703e-05,
1269
+ "loss": 0.05316779017448425,
1270
+ "mean_token_accuracy": 0.9794209063053131,
1271
+ "num_tokens": 25476575.0,
1272
+ "step": 1240
1273
+ },
1274
+ {
1275
+ "entropy": 0.8677761554718018,
1276
+ "epoch": 2.3020257826887662,
1277
+ "grad_norm": 0.760109543800354,
1278
+ "learning_rate": 9.140565345079901e-05,
1279
+ "loss": 0.05115479230880737,
1280
+ "mean_token_accuracy": 0.9802310705184937,
1281
+ "num_tokens": 25682814.0,
1282
+ "step": 1250
1283
+ },
1284
+ {
1285
+ "entropy": 0.8592945456504821,
1286
+ "epoch": 2.320441988950276,
1287
+ "grad_norm": 0.6537907123565674,
1288
+ "learning_rate": 9.123417550354761e-05,
1289
+ "loss": 0.050543540716171266,
1290
+ "mean_token_accuracy": 0.9806945025920868,
1291
+ "num_tokens": 25887575.0,
1292
+ "step": 1260
1293
+ },
1294
+ {
1295
+ "entropy": 0.8692500293254852,
1296
+ "epoch": 2.3388581952117864,
1297
+ "grad_norm": 0.7771905064582825,
1298
+ "learning_rate": 9.106116790287541e-05,
1299
+ "loss": 0.049718713760375975,
1300
+ "mean_token_accuracy": 0.9805168390274048,
1301
+ "num_tokens": 26092950.0,
1302
+ "step": 1270
1303
+ },
1304
+ {
1305
+ "entropy": 0.8841261565685272,
1306
+ "epoch": 2.3572744014732967,
1307
+ "grad_norm": 0.7791076898574829,
1308
+ "learning_rate": 9.08866370668001e-05,
1309
+ "loss": 0.0527400553226471,
1310
+ "mean_token_accuracy": 0.9796754539012908,
1311
+ "num_tokens": 26298182.0,
1312
+ "step": 1280
1313
+ },
1314
+ {
1315
+ "entropy": 0.8675022900104523,
1316
+ "epoch": 2.3756906077348066,
1317
+ "grad_norm": 0.8481605648994446,
1318
+ "learning_rate": 9.07105894698464e-05,
1319
+ "loss": 0.05320838689804077,
1320
+ "mean_token_accuracy": 0.9792274832725525,
1321
+ "num_tokens": 26503425.0,
1322
+ "step": 1290
1323
+ },
1324
+ {
1325
+ "entropy": 0.8704026222229004,
1326
+ "epoch": 2.394106813996317,
1327
+ "grad_norm": 0.8235505819320679,
1328
+ "learning_rate": 9.053303164280602e-05,
1329
+ "loss": 0.055045205354690555,
1330
+ "mean_token_accuracy": 0.9788750648498535,
1331
+ "num_tokens": 26708755.0,
1332
+ "step": 1300
1333
+ },
1334
+ {
1335
+ "entropy": 0.8525134027004242,
1336
+ "epoch": 2.4125230202578267,
1337
+ "grad_norm": 0.7611598968505859,
1338
+ "learning_rate": 9.035397017249518e-05,
1339
+ "loss": 0.05029621124267578,
1340
+ "mean_token_accuracy": 0.9802757322788238,
1341
+ "num_tokens": 26914704.0,
1342
+ "step": 1310
1343
+ },
1344
+ {
1345
+ "entropy": 0.8630305290222168,
1346
+ "epoch": 2.430939226519337,
1347
+ "grad_norm": 0.790408194065094,
1348
+ "learning_rate": 9.017341170151041e-05,
1349
+ "loss": 0.04856040775775909,
1350
+ "mean_token_accuracy": 0.9809690833091735,
1351
+ "num_tokens": 27120151.0,
1352
+ "step": 1320
1353
+ },
1354
+ {
1355
+ "entropy": 0.8579159140586853,
1356
+ "epoch": 2.4493554327808473,
1357
+ "grad_norm": 0.781972348690033,
1358
+ "learning_rate": 8.999136292798207e-05,
1359
+ "loss": 0.04869682788848877,
1360
+ "mean_token_accuracy": 0.9816130697727203,
1361
+ "num_tokens": 27325673.0,
1362
+ "step": 1330
1363
+ },
1364
+ {
1365
+ "entropy": 0.8634716987609863,
1366
+ "epoch": 2.467771639042357,
1367
+ "grad_norm": 0.8500784039497375,
1368
+ "learning_rate": 8.980783060532588e-05,
1369
+ "loss": 0.05050289034843445,
1370
+ "mean_token_accuracy": 0.980079609155655,
1371
+ "num_tokens": 27531270.0,
1372
+ "step": 1340
1373
+ },
1374
+ {
1375
+ "entropy": 0.8660618126392364,
1376
+ "epoch": 2.4861878453038675,
1377
+ "grad_norm": 0.719760537147522,
1378
+ "learning_rate": 8.96228215419924e-05,
1379
+ "loss": 0.04892141819000244,
1380
+ "mean_token_accuracy": 0.9814020991325378,
1381
+ "num_tokens": 27736542.0,
1382
+ "step": 1350
1383
+ },
1384
+ {
1385
+ "entropy": 0.8572284400463104,
1386
+ "epoch": 2.5046040515653774,
1387
+ "grad_norm": 1.0197229385375977,
1388
+ "learning_rate": 8.943634260121442e-05,
1389
+ "loss": 0.05104702711105347,
1390
+ "mean_token_accuracy": 0.9798846662044525,
1391
+ "num_tokens": 27941566.0,
1392
+ "step": 1360
1393
+ },
1394
+ {
1395
+ "entropy": 0.8702241241931915,
1396
+ "epoch": 2.5230202578268877,
1397
+ "grad_norm": 0.7136003375053406,
1398
+ "learning_rate": 8.924840070075247e-05,
1399
+ "loss": 0.04855787754058838,
1400
+ "mean_token_accuracy": 0.9811685383319855,
1401
+ "num_tokens": 28146943.0,
1402
+ "step": 1370
1403
+ },
1404
+ {
1405
+ "entropy": 0.874957013130188,
1406
+ "epoch": 2.541436464088398,
1407
+ "grad_norm": 0.8775497674942017,
1408
+ "learning_rate": 8.905900281263804e-05,
1409
+ "loss": 0.052434295415878296,
1410
+ "mean_token_accuracy": 0.9795438170433044,
1411
+ "num_tokens": 28352640.0,
1412
+ "step": 1380
1413
+ },
1414
+ {
1415
+ "entropy": 0.8776536166667939,
1416
+ "epoch": 2.559852670349908,
1417
+ "grad_norm": 0.8895741105079651,
1418
+ "learning_rate": 8.8868155962915e-05,
1419
+ "loss": 0.05282890796661377,
1420
+ "mean_token_accuracy": 0.9790538609027862,
1421
+ "num_tokens": 28558153.0,
1422
+ "step": 1390
1423
+ },
1424
+ {
1425
+ "entropy": 0.8738743245601654,
1426
+ "epoch": 2.578268876611418,
1427
+ "grad_norm": 0.788800060749054,
1428
+ "learning_rate": 8.867586723137906e-05,
1429
+ "loss": 0.048841872811317445,
1430
+ "mean_token_accuracy": 0.9809149026870727,
1431
+ "num_tokens": 28763613.0,
1432
+ "step": 1400
1433
+ },
1434
+ {
1435
+ "entropy": 0.8750253796577454,
1436
+ "epoch": 2.596685082872928,
1437
+ "grad_norm": 0.8738002777099609,
1438
+ "learning_rate": 8.848214375131497e-05,
1439
+ "loss": 0.048261132836341855,
1440
+ "mean_token_accuracy": 0.980789190530777,
1441
+ "num_tokens": 28969248.0,
1442
+ "step": 1410
1443
+ },
1444
+ {
1445
+ "entropy": 0.8624245524406433,
1446
+ "epoch": 2.6151012891344383,
1447
+ "grad_norm": 0.6404895186424255,
1448
+ "learning_rate": 8.828699270923196e-05,
1449
+ "loss": 0.04970468282699585,
1450
+ "mean_token_accuracy": 0.9807762265205383,
1451
+ "num_tokens": 29174779.0,
1452
+ "step": 1420
1453
+ },
1454
+ {
1455
+ "entropy": 0.8792938470840455,
1456
+ "epoch": 2.6335174953959486,
1457
+ "grad_norm": 0.7856965661048889,
1458
+ "learning_rate": 8.80904213445972e-05,
1459
+ "loss": 0.053334391117095946,
1460
+ "mean_token_accuracy": 0.9790222108364105,
1461
+ "num_tokens": 29380474.0,
1462
+ "step": 1430
1463
+ },
1464
+ {
1465
+ "entropy": 0.8831034600734711,
1466
+ "epoch": 2.6519337016574585,
1467
+ "grad_norm": 0.7739618420600891,
1468
+ "learning_rate": 8.789243694956716e-05,
1469
+ "loss": 0.04959054589271546,
1470
+ "mean_token_accuracy": 0.9803965091705322,
1471
+ "num_tokens": 29585985.0,
1472
+ "step": 1440
1473
+ },
1474
+ {
1475
+ "entropy": 0.8934672951698304,
1476
+ "epoch": 2.6703499079189688,
1477
+ "grad_norm": 0.6999697089195251,
1478
+ "learning_rate": 8.769304686871719e-05,
1479
+ "loss": 0.05165250301361084,
1480
+ "mean_token_accuracy": 0.9798884153366089,
1481
+ "num_tokens": 29791238.0,
1482
+ "step": 1450
1483
+ },
1484
+ {
1485
+ "entropy": 0.9053199410438537,
1486
+ "epoch": 2.6887661141804786,
1487
+ "grad_norm": 0.9199564456939697,
1488
+ "learning_rate": 8.749225849876892e-05,
1489
+ "loss": 0.04924143850803375,
1490
+ "mean_token_accuracy": 0.9810785710811615,
1491
+ "num_tokens": 29996589.0,
1492
+ "step": 1460
1493
+ },
1494
+ {
1495
+ "entropy": 0.888091403245926,
1496
+ "epoch": 2.707182320441989,
1497
+ "grad_norm": 0.7480106353759766,
1498
+ "learning_rate": 8.729007928831597e-05,
1499
+ "loss": 0.04948916733264923,
1500
+ "mean_token_accuracy": 0.9809579730033875,
1501
+ "num_tokens": 30201875.0,
1502
+ "step": 1470
1503
+ },
1504
+ {
1505
+ "entropy": 0.8723407983779907,
1506
+ "epoch": 2.7255985267034992,
1507
+ "grad_norm": 0.9506945013999939,
1508
+ "learning_rate": 8.708651673754763e-05,
1509
+ "loss": 0.048927539587020875,
1510
+ "mean_token_accuracy": 0.980553150177002,
1511
+ "num_tokens": 30407550.0,
1512
+ "step": 1480
1513
+ },
1514
+ {
1515
+ "entropy": 0.8737521529197693,
1516
+ "epoch": 2.744014732965009,
1517
+ "grad_norm": 0.8015706539154053,
1518
+ "learning_rate": 8.688157839797062e-05,
1519
+ "loss": 0.04963063597679138,
1520
+ "mean_token_accuracy": 0.9809738755226135,
1521
+ "num_tokens": 30612839.0,
1522
+ "step": 1490
1523
+ },
1524
+ {
1525
+ "entropy": 0.8800762951374054,
1526
+ "epoch": 2.7624309392265194,
1527
+ "grad_norm": 0.9429986476898193,
1528
+ "learning_rate": 8.667527187212885e-05,
1529
+ "loss": 0.0524174690246582,
1530
+ "mean_token_accuracy": 0.9788767337799072,
1531
+ "num_tokens": 30818578.0,
1532
+ "step": 1500
1533
+ },
1534
+ {
1535
+ "entropy": 0.8871055901050567,
1536
+ "epoch": 2.7808471454880292,
1537
+ "grad_norm": 0.5909196138381958,
1538
+ "learning_rate": 8.646760481332157e-05,
1539
+ "loss": 0.05166680812835693,
1540
+ "mean_token_accuracy": 0.980216771364212,
1541
+ "num_tokens": 31023829.0,
1542
+ "step": 1510
1543
+ },
1544
+ {
1545
+ "entropy": 0.8908755779266357,
1546
+ "epoch": 2.7992633517495396,
1547
+ "grad_norm": 0.9154611229896545,
1548
+ "learning_rate": 8.625858492531931e-05,
1549
+ "loss": 0.04951836466789246,
1550
+ "mean_token_accuracy": 0.9801484227180481,
1551
+ "num_tokens": 31229635.0,
1552
+ "step": 1520
1553
+ },
1554
+ {
1555
+ "entropy": 0.92480548620224,
1556
+ "epoch": 2.81767955801105,
1557
+ "grad_norm": 0.5989938378334045,
1558
+ "learning_rate": 8.604821996207819e-05,
1559
+ "loss": 0.04799881279468536,
1560
+ "mean_token_accuracy": 0.9817522585391998,
1561
+ "num_tokens": 31435456.0,
1562
+ "step": 1530
1563
+ },
1564
+ {
1565
+ "entropy": 0.9173881888389588,
1566
+ "epoch": 2.8360957642725597,
1567
+ "grad_norm": 0.899413526058197,
1568
+ "learning_rate": 8.58365177274522e-05,
1569
+ "loss": 0.0487445592880249,
1570
+ "mean_token_accuracy": 0.9812625288963318,
1571
+ "num_tokens": 31640904.0,
1572
+ "step": 1540
1573
+ },
1574
+ {
1575
+ "entropy": 0.9076135993003845,
1576
+ "epoch": 2.85451197053407,
1577
+ "grad_norm": 0.8494166135787964,
1578
+ "learning_rate": 8.562348607490376e-05,
1579
+ "loss": 0.05005228519439697,
1580
+ "mean_token_accuracy": 0.9806681036949157,
1581
+ "num_tokens": 31845807.0,
1582
+ "step": 1550
1583
+ },
1584
+ {
1585
+ "entropy": 0.9092245221138,
1586
+ "epoch": 2.87292817679558,
1587
+ "grad_norm": 0.8225123286247253,
1588
+ "learning_rate": 8.540913290721234e-05,
1589
+ "loss": 0.048654764890670776,
1590
+ "mean_token_accuracy": 0.9805659353733063,
1591
+ "num_tokens": 32051523.0,
1592
+ "step": 1560
1593
+ },
1594
+ {
1595
+ "entropy": 0.9062779664993286,
1596
+ "epoch": 2.89134438305709,
1597
+ "grad_norm": 0.7074014544487,
1598
+ "learning_rate": 8.519346617618134e-05,
1599
+ "loss": 0.049209845066070554,
1600
+ "mean_token_accuracy": 0.9807434439659118,
1601
+ "num_tokens": 32256895.0,
1602
+ "step": 1570
1603
+ },
1604
+ {
1605
+ "entropy": 0.9190246641635895,
1606
+ "epoch": 2.9097605893186005,
1607
+ "grad_norm": 0.8860642910003662,
1608
+ "learning_rate": 8.497649388234304e-05,
1609
+ "loss": 0.051211881637573245,
1610
+ "mean_token_accuracy": 0.9802342295646668,
1611
+ "num_tokens": 32462031.0,
1612
+ "step": 1580
1613
+ },
1614
+ {
1615
+ "entropy": 0.9088015079498291,
1616
+ "epoch": 2.9281767955801103,
1617
+ "grad_norm": 0.8062726855278015,
1618
+ "learning_rate": 8.475822407466188e-05,
1619
+ "loss": 0.053512704372406,
1620
+ "mean_token_accuracy": 0.979486483335495,
1621
+ "num_tokens": 32667533.0,
1622
+ "step": 1590
1623
+ },
1624
+ {
1625
+ "entropy": 0.9462027847766876,
1626
+ "epoch": 2.9465930018416207,
1627
+ "grad_norm": 0.7962909936904907,
1628
+ "learning_rate": 8.453866485023579e-05,
1629
+ "loss": 0.0501457154750824,
1630
+ "mean_token_accuracy": 0.9803222417831421,
1631
+ "num_tokens": 32872900.0,
1632
+ "step": 1600
1633
+ },
1634
+ {
1635
+ "entropy": 0.9671471297740937,
1636
+ "epoch": 2.9650092081031305,
1637
+ "grad_norm": 0.7641744017601013,
1638
+ "learning_rate": 8.431782435399587e-05,
1639
+ "loss": 0.04629061222076416,
1640
+ "mean_token_accuracy": 0.9823175370693207,
1641
+ "num_tokens": 33077850.0,
1642
+ "step": 1610
1643
+ },
1644
+ {
1645
+ "entropy": 0.955865204334259,
1646
+ "epoch": 2.983425414364641,
1647
+ "grad_norm": 0.6772348880767822,
1648
+ "learning_rate": 8.409571077840426e-05,
1649
+ "loss": 0.048368623852729796,
1650
+ "mean_token_accuracy": 0.9808700799942016,
1651
+ "num_tokens": 33283117.0,
1652
+ "step": 1620
1653
+ },
1654
+ {
1655
+ "epoch": 3.0,
1656
+ "eval_entropy": 0.9563225186389426,
1657
+ "eval_loss": 0.059064481407403946,
1658
+ "eval_mean_token_accuracy": 0.9773589429648026,
1659
+ "eval_num_tokens": 33467712.0,
1660
+ "eval_runtime": 10.1471,
1661
+ "eval_samples_per_second": 360.499,
1662
+ "eval_steps_per_second": 11.333,
1663
+ "step": 1629
1664
+ },
1665
+ {
1666
+ "entropy": 0.9337226033210755,
1667
+ "epoch": 3.001841620626151,
1668
+ "grad_norm": 0.646203875541687,
1669
+ "learning_rate": 8.387233236315016e-05,
1670
+ "loss": 0.043352216482162476,
1671
+ "mean_token_accuracy": 0.9830620110034942,
1672
+ "num_tokens": 33488302.0,
1673
+ "step": 1630
1674
+ },
1675
+ {
1676
+ "entropy": 0.9734923839569092,
1677
+ "epoch": 3.020257826887661,
1678
+ "grad_norm": 0.7564226984977722,
1679
+ "learning_rate": 8.364769739484416e-05,
1680
+ "loss": 0.033932483196258544,
1681
+ "mean_token_accuracy": 0.9872806966304779,
1682
+ "num_tokens": 33693531.0,
1683
+ "step": 1640
1684
+ },
1685
+ {
1686
+ "entropy": 0.9669206500053406,
1687
+ "epoch": 3.0386740331491713,
1688
+ "grad_norm": 0.7126886248588562,
1689
+ "learning_rate": 8.342181420671096e-05,
1690
+ "loss": 0.03818287253379822,
1691
+ "mean_token_accuracy": 0.9852082908153534,
1692
+ "num_tokens": 33899305.0,
1693
+ "step": 1650
1694
+ },
1695
+ {
1696
+ "entropy": 0.9522916138172149,
1697
+ "epoch": 3.0570902394106816,
1698
+ "grad_norm": 1.0571653842926025,
1699
+ "learning_rate": 8.319469117828007e-05,
1700
+ "loss": 0.03456039130687714,
1701
+ "mean_token_accuracy": 0.9867027878761292,
1702
+ "num_tokens": 34104585.0,
1703
+ "step": 1660
1704
+ },
1705
+ {
1706
+ "entropy": 0.9568560004234314,
1707
+ "epoch": 3.0755064456721914,
1708
+ "grad_norm": 0.780940592288971,
1709
+ "learning_rate": 8.296633673507505e-05,
1710
+ "loss": 0.03551802039146423,
1711
+ "mean_token_accuracy": 0.9867531359195709,
1712
+ "num_tokens": 34309516.0,
1713
+ "step": 1670
1714
+ },
1715
+ {
1716
+ "entropy": 0.9590656876564025,
1717
+ "epoch": 3.0939226519337018,
1718
+ "grad_norm": 0.8330219388008118,
1719
+ "learning_rate": 8.273675934830094e-05,
1720
+ "loss": 0.03674865961074829,
1721
+ "mean_token_accuracy": 0.9864118576049805,
1722
+ "num_tokens": 34515170.0,
1723
+ "step": 1680
1724
+ },
1725
+ {
1726
+ "entropy": 0.975881814956665,
1727
+ "epoch": 3.1123388581952116,
1728
+ "grad_norm": 0.7010637521743774,
1729
+ "learning_rate": 8.250596753453e-05,
1730
+ "loss": 0.03550414443016052,
1731
+ "mean_token_accuracy": 0.9864102602005005,
1732
+ "num_tokens": 34720896.0,
1733
+ "step": 1690
1734
+ },
1735
+ {
1736
+ "entropy": 0.9599562883377075,
1737
+ "epoch": 3.130755064456722,
1738
+ "grad_norm": 0.6694278717041016,
1739
+ "learning_rate": 8.227396985538578e-05,
1740
+ "loss": 0.035564273595809937,
1741
+ "mean_token_accuracy": 0.9867321848869324,
1742
+ "num_tokens": 34925970.0,
1743
+ "step": 1700
1744
+ },
1745
+ {
1746
+ "entropy": 0.9582216143608093,
1747
+ "epoch": 3.149171270718232,
1748
+ "grad_norm": 0.9333199262619019,
1749
+ "learning_rate": 8.204077491722546e-05,
1750
+ "loss": 0.035575729608535764,
1751
+ "mean_token_accuracy": 0.9862452208995819,
1752
+ "num_tokens": 35131543.0,
1753
+ "step": 1710
1754
+ },
1755
+ {
1756
+ "entropy": 0.9579678058624268,
1757
+ "epoch": 3.167587476979742,
1758
+ "grad_norm": 0.9450218081474304,
1759
+ "learning_rate": 8.180639137082066e-05,
1760
+ "loss": 0.0385298490524292,
1761
+ "mean_token_accuracy": 0.98538036942482,
1762
+ "num_tokens": 35336790.0,
1763
+ "step": 1720
1764
+ },
1765
+ {
1766
+ "entropy": 0.9640831351280212,
1767
+ "epoch": 3.1860036832412524,
1768
+ "grad_norm": 0.8551534414291382,
1769
+ "learning_rate": 8.157082791103649e-05,
1770
+ "loss": 0.03702138364315033,
1771
+ "mean_token_accuracy": 0.9852015495300293,
1772
+ "num_tokens": 35542294.0,
1773
+ "step": 1730
1774
+ },
1775
+ {
1776
+ "entropy": 0.9867071211338043,
1777
+ "epoch": 3.2044198895027622,
1778
+ "grad_norm": 0.7138128876686096,
1779
+ "learning_rate": 8.133409327650897e-05,
1780
+ "loss": 0.035626694560050964,
1781
+ "mean_token_accuracy": 0.986064875125885,
1782
+ "num_tokens": 35747447.0,
1783
+ "step": 1740
1784
+ },
1785
+ {
1786
+ "entropy": 0.9639089345932007,
1787
+ "epoch": 3.2228360957642725,
1788
+ "grad_norm": 0.7131415009498596,
1789
+ "learning_rate": 8.109619624932092e-05,
1790
+ "loss": 0.035885071754455565,
1791
+ "mean_token_accuracy": 0.986273056268692,
1792
+ "num_tokens": 35952258.0,
1793
+ "step": 1750
1794
+ },
1795
+ {
1796
+ "entropy": 0.9516046345233917,
1797
+ "epoch": 3.241252302025783,
1798
+ "grad_norm": 0.6900200843811035,
1799
+ "learning_rate": 8.085714565467611e-05,
1800
+ "loss": 0.03535219430923462,
1801
+ "mean_token_accuracy": 0.985836285352707,
1802
+ "num_tokens": 36157938.0,
1803
+ "step": 1760
1804
+ },
1805
+ {
1806
+ "entropy": 0.9373646557331086,
1807
+ "epoch": 3.2596685082872927,
1808
+ "grad_norm": 0.6101690530776978,
1809
+ "learning_rate": 8.061695036057191e-05,
1810
+ "loss": 0.034940996766090394,
1811
+ "mean_token_accuracy": 0.9863743901252746,
1812
+ "num_tokens": 36363825.0,
1813
+ "step": 1770
1814
+ },
1815
+ {
1816
+ "entropy": 0.9444344758987426,
1817
+ "epoch": 3.278084714548803,
1818
+ "grad_norm": 0.7518529295921326,
1819
+ "learning_rate": 8.03756192774703e-05,
1820
+ "loss": 0.03404279053211212,
1821
+ "mean_token_accuracy": 0.9866396844387054,
1822
+ "num_tokens": 36568961.0,
1823
+ "step": 1780
1824
+ },
1825
+ {
1826
+ "entropy": 0.9550357758998871,
1827
+ "epoch": 3.2965009208103133,
1828
+ "grad_norm": 0.7687555551528931,
1829
+ "learning_rate": 8.013316135796734e-05,
1830
+ "loss": 0.038447052240371704,
1831
+ "mean_token_accuracy": 0.985325163602829,
1832
+ "num_tokens": 36774514.0,
1833
+ "step": 1790
1834
+ },
1835
+ {
1836
+ "entropy": 0.9477231681346894,
1837
+ "epoch": 3.314917127071823,
1838
+ "grad_norm": 0.7521633505821228,
1839
+ "learning_rate": 7.988958559646102e-05,
1840
+ "loss": 0.03746694028377533,
1841
+ "mean_token_accuracy": 0.9853165090084076,
1842
+ "num_tokens": 36979660.0,
1843
+ "step": 1800
1844
+ },
1845
+ {
1846
+ "entropy": 0.925805002450943,
1847
+ "epoch": 3.3333333333333335,
1848
+ "grad_norm": 0.9333297610282898,
1849
+ "learning_rate": 7.964490102881768e-05,
1850
+ "loss": 0.03700103759765625,
1851
+ "mean_token_accuracy": 0.9850880861282348,
1852
+ "num_tokens": 37185191.0,
1853
+ "step": 1810
1854
+ },
1855
+ {
1856
+ "entropy": 0.9225482225418091,
1857
+ "epoch": 3.3517495395948433,
1858
+ "grad_norm": 0.7928622961044312,
1859
+ "learning_rate": 7.939911673203665e-05,
1860
+ "loss": 0.03825801610946655,
1861
+ "mean_token_accuracy": 0.9850241422653199,
1862
+ "num_tokens": 37390749.0,
1863
+ "step": 1820
1864
+ },
1865
+ {
1866
+ "entropy": 0.9597147881984711,
1867
+ "epoch": 3.3701657458563536,
1868
+ "grad_norm": 0.7658583521842957,
1869
+ "learning_rate": 7.915224182391375e-05,
1870
+ "loss": 0.039855146408081056,
1871
+ "mean_token_accuracy": 0.9845879554748536,
1872
+ "num_tokens": 37596052.0,
1873
+ "step": 1830
1874
+ },
1875
+ {
1876
+ "entropy": 0.9485619068145752,
1877
+ "epoch": 3.388581952117864,
1878
+ "grad_norm": 0.8492130637168884,
1879
+ "learning_rate": 7.890428546270278e-05,
1880
+ "loss": 0.039359599351882935,
1881
+ "mean_token_accuracy": 0.9847265422344208,
1882
+ "num_tokens": 37802063.0,
1883
+ "step": 1840
1884
+ },
1885
+ {
1886
+ "entropy": 0.9670301914215088,
1887
+ "epoch": 3.406998158379374,
1888
+ "grad_norm": 0.7527599930763245,
1889
+ "learning_rate": 7.865525684677608e-05,
1890
+ "loss": 0.03752985596656799,
1891
+ "mean_token_accuracy": 0.9855137526988983,
1892
+ "num_tokens": 38007432.0,
1893
+ "step": 1850
1894
+ },
1895
+ {
1896
+ "entropy": 0.9681244969367981,
1897
+ "epoch": 3.425414364640884,
1898
+ "grad_norm": 0.7599612474441528,
1899
+ "learning_rate": 7.840516521428303e-05,
1900
+ "loss": 0.03653894364833832,
1901
+ "mean_token_accuracy": 0.9858933389186859,
1902
+ "num_tokens": 38212923.0,
1903
+ "step": 1860
1904
+ },
1905
+ {
1906
+ "entropy": 0.9706049561500549,
1907
+ "epoch": 3.443830570902394,
1908
+ "grad_norm": 0.7678127884864807,
1909
+ "learning_rate": 7.815401984280748e-05,
1910
+ "loss": 0.0366938978433609,
1911
+ "mean_token_accuracy": 0.9854713797569274,
1912
+ "num_tokens": 38418422.0,
1913
+ "step": 1870
1914
+ },
1915
+ {
1916
+ "entropy": 0.9637093842029572,
1917
+ "epoch": 3.4622467771639043,
1918
+ "grad_norm": 0.762824535369873,
1919
+ "learning_rate": 7.790183004902359e-05,
1920
+ "loss": 0.03516915142536163,
1921
+ "mean_token_accuracy": 0.9866003453731537,
1922
+ "num_tokens": 38624389.0,
1923
+ "step": 1880
1924
+ },
1925
+ {
1926
+ "entropy": 0.9373565018177032,
1927
+ "epoch": 3.4806629834254146,
1928
+ "grad_norm": 0.8221780061721802,
1929
+ "learning_rate": 7.764860518835014e-05,
1930
+ "loss": 0.04049026966094971,
1931
+ "mean_token_accuracy": 0.984089481830597,
1932
+ "num_tokens": 38829654.0,
1933
+ "step": 1890
1934
+ },
1935
+ {
1936
+ "entropy": 0.9356025457382202,
1937
+ "epoch": 3.4990791896869244,
1938
+ "grad_norm": 0.7583426237106323,
1939
+ "learning_rate": 7.739435465460356e-05,
1940
+ "loss": 0.03658481240272522,
1941
+ "mean_token_accuracy": 0.9857318818569183,
1942
+ "num_tokens": 39034638.0,
1943
+ "step": 1900
1944
+ },
1945
+ {
1946
+ "entropy": 0.9740163326263428,
1947
+ "epoch": 3.5174953959484347,
1948
+ "grad_norm": 0.7332878112792969,
1949
+ "learning_rate": 7.713908787964937e-05,
1950
+ "loss": 0.03508963882923126,
1951
+ "mean_token_accuracy": 0.9863419532775879,
1952
+ "num_tokens": 39240265.0,
1953
+ "step": 1910
1954
+ },
1955
+ {
1956
+ "entropy": 0.9528286933898926,
1957
+ "epoch": 3.5359116022099446,
1958
+ "grad_norm": 0.6515451669692993,
1959
+ "learning_rate": 7.688281433305233e-05,
1960
+ "loss": 0.036055779457092284,
1961
+ "mean_token_accuracy": 0.9860979080200195,
1962
+ "num_tokens": 39445546.0,
1963
+ "step": 1920
1964
+ },
1965
+ {
1966
+ "entropy": 0.9480705261230469,
1967
+ "epoch": 3.554327808471455,
1968
+ "grad_norm": 0.7725827097892761,
1969
+ "learning_rate": 7.662554352172515e-05,
1970
+ "loss": 0.037101513147354125,
1971
+ "mean_token_accuracy": 0.985782790184021,
1972
+ "num_tokens": 39651078.0,
1973
+ "step": 1930
1974
+ },
1975
+ {
1976
+ "entropy": 0.9655321061611175,
1977
+ "epoch": 3.572744014732965,
1978
+ "grad_norm": 0.7756506204605103,
1979
+ "learning_rate": 7.636728498957581e-05,
1980
+ "loss": 0.03721855878829956,
1981
+ "mean_token_accuracy": 0.9857951939105988,
1982
+ "num_tokens": 39856542.0,
1983
+ "step": 1940
1984
+ },
1985
+ {
1986
+ "entropy": 0.9772682309150695,
1987
+ "epoch": 3.591160220994475,
1988
+ "grad_norm": 0.9084987640380859,
1989
+ "learning_rate": 7.610804831715355e-05,
1990
+ "loss": 0.03570749163627625,
1991
+ "mean_token_accuracy": 0.9863450109958649,
1992
+ "num_tokens": 40061913.0,
1993
+ "step": 1950
1994
+ },
1995
+ {
1996
+ "entropy": 0.9579685389995575,
1997
+ "epoch": 3.6095764272559854,
1998
+ "grad_norm": 0.6358487606048584,
1999
+ "learning_rate": 7.584784312129334e-05,
2000
+ "loss": 0.038210684061050416,
2001
+ "mean_token_accuracy": 0.9850837290287018,
2002
+ "num_tokens": 40267398.0,
2003
+ "step": 1960
2004
+ },
2005
+ {
2006
+ "entropy": 0.9605201721191406,
2007
+ "epoch": 3.6279926335174952,
2008
+ "grad_norm": 0.6263149976730347,
2009
+ "learning_rate": 7.558667905475927e-05,
2010
+ "loss": 0.03509160876274109,
2011
+ "mean_token_accuracy": 0.9868143379688263,
2012
+ "num_tokens": 40472827.0,
2013
+ "step": 1970
2014
+ },
2015
+ {
2016
+ "entropy": 0.964026153087616,
2017
+ "epoch": 3.6464088397790055,
2018
+ "grad_norm": 0.90068119764328,
2019
+ "learning_rate": 7.532456580588638e-05,
2020
+ "loss": 0.036211782693862916,
2021
+ "mean_token_accuracy": 0.9858468770980835,
2022
+ "num_tokens": 40677935.0,
2023
+ "step": 1980
2024
+ },
2025
+ {
2026
+ "entropy": 0.9494135618209839,
2027
+ "epoch": 3.664825046040516,
2028
+ "grad_norm": 0.760134756565094,
2029
+ "learning_rate": 7.50615130982213e-05,
2030
+ "loss": 0.03786201477050781,
2031
+ "mean_token_accuracy": 0.9852500438690186,
2032
+ "num_tokens": 40883750.0,
2033
+ "step": 1990
2034
+ },
2035
+ {
2036
+ "entropy": 0.9527071297168732,
2037
+ "epoch": 3.6832412523020257,
2038
+ "grad_norm": 0.9812107682228088,
2039
+ "learning_rate": 7.479753069016152e-05,
2040
+ "loss": 0.03803159594535828,
2041
+ "mean_token_accuracy": 0.9852405369281769,
2042
+ "num_tokens": 41089115.0,
2043
+ "step": 2000
2044
+ },
2045
+ {
2046
+ "entropy": 0.9639330863952636,
2047
+ "epoch": 3.701657458563536,
2048
+ "grad_norm": 0.7164933681488037,
2049
+ "learning_rate": 7.453262837459332e-05,
2050
+ "loss": 0.03912568986415863,
2051
+ "mean_token_accuracy": 0.9849458575248718,
2052
+ "num_tokens": 41294694.0,
2053
+ "step": 2010
2054
+ },
2055
+ {
2056
+ "entropy": 0.9536987483501435,
2057
+ "epoch": 3.720073664825046,
2058
+ "grad_norm": 0.6804596185684204,
2059
+ "learning_rate": 7.426681597852863e-05,
2060
+ "loss": 0.036410006880760196,
2061
+ "mean_token_accuracy": 0.985712206363678,
2062
+ "num_tokens": 41499817.0,
2063
+ "step": 2020
2064
+ },
2065
+ {
2066
+ "entropy": 0.9478164672851562,
2067
+ "epoch": 3.738489871086556,
2068
+ "grad_norm": 0.8799397349357605,
2069
+ "learning_rate": 7.400010336274037e-05,
2070
+ "loss": 0.03801035583019256,
2071
+ "mean_token_accuracy": 0.9850274682044983,
2072
+ "num_tokens": 41704932.0,
2073
+ "step": 2030
2074
+ },
2075
+ {
2076
+ "entropy": 0.9383447647094727,
2077
+ "epoch": 3.7569060773480665,
2078
+ "grad_norm": 0.8386216163635254,
2079
+ "learning_rate": 7.373250042139664e-05,
2080
+ "loss": 0.0373637855052948,
2081
+ "mean_token_accuracy": 0.9854822158813477,
2082
+ "num_tokens": 41910804.0,
2083
+ "step": 2040
2084
+ },
2085
+ {
2086
+ "entropy": 0.925172996520996,
2087
+ "epoch": 3.7753222836095763,
2088
+ "grad_norm": 0.7599324584007263,
2089
+ "learning_rate": 7.346401708169377e-05,
2090
+ "loss": 0.03585260808467865,
2091
+ "mean_token_accuracy": 0.9860672950744629,
2092
+ "num_tokens": 42116706.0,
2093
+ "step": 2050
2094
+ },
2095
+ {
2096
+ "entropy": 0.9463765442371368,
2097
+ "epoch": 3.7937384898710866,
2098
+ "grad_norm": 0.9030149579048157,
2099
+ "learning_rate": 7.319466330348797e-05,
2100
+ "loss": 0.035877206921577455,
2101
+ "mean_token_accuracy": 0.9863968968391419,
2102
+ "num_tokens": 42322670.0,
2103
+ "step": 2060
2104
+ },
2105
+ {
2106
+ "entropy": 0.9942441761493683,
2107
+ "epoch": 3.8121546961325965,
2108
+ "grad_norm": 0.6400449275970459,
2109
+ "learning_rate": 7.292444907892587e-05,
2110
+ "loss": 0.037310433387756345,
2111
+ "mean_token_accuracy": 0.9854151606559753,
2112
+ "num_tokens": 42527752.0,
2113
+ "step": 2070
2114
+ },
2115
+ {
2116
+ "entropy": 0.9577703952789307,
2117
+ "epoch": 3.830570902394107,
2118
+ "grad_norm": 0.6193167567253113,
2119
+ "learning_rate": 7.265338443207387e-05,
2120
+ "loss": 0.03648848831653595,
2121
+ "mean_token_accuracy": 0.9856530070304871,
2122
+ "num_tokens": 42732981.0,
2123
+ "step": 2080
2124
+ },
2125
+ {
2126
+ "entropy": 0.9663952767848969,
2127
+ "epoch": 3.848987108655617,
2128
+ "grad_norm": 0.759611189365387,
2129
+ "learning_rate": 7.238147941854625e-05,
2130
+ "loss": 0.036112996935844424,
2131
+ "mean_token_accuracy": 0.9862765550613404,
2132
+ "num_tokens": 42938619.0,
2133
+ "step": 2090
2134
+ },
2135
+ {
2136
+ "entropy": 0.9484863519668579,
2137
+ "epoch": 3.867403314917127,
2138
+ "grad_norm": 0.7420705556869507,
2139
+ "learning_rate": 7.210874412513218e-05,
2140
+ "loss": 0.03703283965587616,
2141
+ "mean_token_accuracy": 0.9857317566871643,
2142
+ "num_tokens": 43143753.0,
2143
+ "step": 2100
2144
+ },
2145
+ {
2146
+ "entropy": 0.964326673746109,
2147
+ "epoch": 3.8858195211786373,
2148
+ "grad_norm": 0.8779639601707458,
2149
+ "learning_rate": 7.183518866942147e-05,
2150
+ "loss": 0.03739701807498932,
2151
+ "mean_token_accuracy": 0.9852154791355133,
2152
+ "num_tokens": 43349451.0,
2153
+ "step": 2110
2154
+ },
2155
+ {
2156
+ "entropy": 0.9729791641235351,
2157
+ "epoch": 3.904235727440147,
2158
+ "grad_norm": 0.7582741379737854,
2159
+ "learning_rate": 7.156082319942929e-05,
2160
+ "loss": 0.03894525766372681,
2161
+ "mean_token_accuracy": 0.9847454309463501,
2162
+ "num_tokens": 43554598.0,
2163
+ "step": 2120
2164
+ },
2165
+ {
2166
+ "entropy": 0.9860592544078827,
2167
+ "epoch": 3.9226519337016574,
2168
+ "grad_norm": 0.860698938369751,
2169
+ "learning_rate": 7.128565789321969e-05,
2170
+ "loss": 0.0365300178527832,
2171
+ "mean_token_accuracy": 0.9859121859073638,
2172
+ "num_tokens": 43760081.0,
2173
+ "step": 2130
2174
+ },
2175
+ {
2176
+ "entropy": 0.9916551172733307,
2177
+ "epoch": 3.9410681399631677,
2178
+ "grad_norm": 0.8363776206970215,
2179
+ "learning_rate": 7.100970295852805e-05,
2180
+ "loss": 0.036221379041671754,
2181
+ "mean_token_accuracy": 0.9859034180641174,
2182
+ "num_tokens": 43965432.0,
2183
+ "step": 2140
2184
+ },
2185
+ {
2186
+ "entropy": 0.9553558886051178,
2187
+ "epoch": 3.9594843462246776,
2188
+ "grad_norm": 0.9627474546432495,
2189
+ "learning_rate": 7.073296863238242e-05,
2190
+ "loss": 0.03684481382369995,
2191
+ "mean_token_accuracy": 0.9857315957546234,
2192
+ "num_tokens": 44171232.0,
2193
+ "step": 2150
2194
+ },
2195
+ {
2196
+ "entropy": 0.9538035809993743,
2197
+ "epoch": 3.977900552486188,
2198
+ "grad_norm": 0.8399474620819092,
2199
+ "learning_rate": 7.045546518072366e-05,
2200
+ "loss": 0.03825397789478302,
2201
+ "mean_token_accuracy": 0.9846831560134888,
2202
+ "num_tokens": 44376723.0,
2203
+ "step": 2160
2204
+ },
2205
+ {
2206
+ "entropy": 0.9476235210895538,
2207
+ "epoch": 3.9963167587476978,
2208
+ "grad_norm": 0.708739697933197,
2209
+ "learning_rate": 7.017720289802472e-05,
2210
+ "loss": 0.03618018329143524,
2211
+ "mean_token_accuracy": 0.9861325800418854,
2212
+ "num_tokens": 44582407.0,
2213
+ "step": 2170
2214
+ },
2215
+ {
2216
+ "epoch": 4.0,
2217
+ "eval_entropy": 0.9569619194321011,
2218
+ "eval_loss": 0.059838198125362396,
2219
+ "eval_mean_token_accuracy": 0.9777795366618944,
2220
+ "eval_num_tokens": 44623647.0,
2221
+ "eval_runtime": 10.0379,
2222
+ "eval_samples_per_second": 364.42,
2223
+ "eval_steps_per_second": 11.457,
2224
+ "step": 2172
2225
+ }
2226
+ ],
2227
+ "logging_steps": 10,
2228
+ "max_steps": 5430,
2229
+ "num_input_tokens_seen": 0,
2230
+ "num_train_epochs": 10,
2231
+ "save_steps": 500,
2232
+ "stateful_callbacks": {
2233
+ "TrainerControl": {
2234
+ "args": {
2235
+ "should_epoch_stop": false,
2236
+ "should_evaluate": false,
2237
+ "should_log": false,
2238
+ "should_save": true,
2239
+ "should_training_stop": false
2240
+ },
2241
+ "attributes": {}
2242
+ }
2243
+ },
2244
+ "total_flos": 2.12729708313523e+18,
2245
+ "train_batch_size": 32,
2246
+ "trial_name": null,
2247
+ "trial_params": null
2248
+ }
checkpoint-2172/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75
3
+ size 5777
checkpoint-2715/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-2715/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-2715/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65a66fa5ef9ed41e342eac55fca7f83744379f75f4d29b573d2790ba504c1659
3
+ size 80792096
checkpoint-2715/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-2715/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-2715/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-2715/trainer_state.json ADDED
@@ -0,0 +1,2799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2715,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2237394809722901,
14
+ "epoch": 0.01841620626151013,
15
+ "grad_norm": 5.082435607910156,
16
+ "learning_rate": 3.308823529411765e-06,
17
+ "loss": 0.9237876892089844,
18
+ "mean_token_accuracy": 0.7685343027114868,
19
+ "num_tokens": 205423.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2295925617218018,
24
+ "epoch": 0.03683241252302026,
25
+ "grad_norm": 4.672000408172607,
26
+ "learning_rate": 6.985294117647059e-06,
27
+ "loss": 0.8900892257690429,
28
+ "mean_token_accuracy": 0.7677771031856537,
29
+ "num_tokens": 410849.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2285718679428101,
34
+ "epoch": 0.055248618784530384,
35
+ "grad_norm": 1.4828118085861206,
36
+ "learning_rate": 1.0661764705882354e-05,
37
+ "loss": 0.5975452899932862,
38
+ "mean_token_accuracy": 0.8146551787853241,
39
+ "num_tokens": 616438.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.210776400566101,
44
+ "epoch": 0.07366482504604052,
45
+ "grad_norm": 0.7761328816413879,
46
+ "learning_rate": 1.4338235294117647e-05,
47
+ "loss": 0.40664992332458494,
48
+ "mean_token_accuracy": 0.8699092030525207,
49
+ "num_tokens": 822118.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.200321125984192,
54
+ "epoch": 0.09208103130755065,
55
+ "grad_norm": 0.5363371968269348,
56
+ "learning_rate": 1.8014705882352943e-05,
57
+ "loss": 0.3313469409942627,
58
+ "mean_token_accuracy": 0.8904915869235992,
59
+ "num_tokens": 1027941.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.1809936046600342,
64
+ "epoch": 0.11049723756906077,
65
+ "grad_norm": 0.39541518688201904,
66
+ "learning_rate": 2.1691176470588237e-05,
67
+ "loss": 0.27568228244781495,
68
+ "mean_token_accuracy": 0.9047131836414337,
69
+ "num_tokens": 1233620.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.169810914993286,
74
+ "epoch": 0.1289134438305709,
75
+ "grad_norm": 0.341960072517395,
76
+ "learning_rate": 2.536764705882353e-05,
77
+ "loss": 0.245219087600708,
78
+ "mean_token_accuracy": 0.9150686681270599,
79
+ "num_tokens": 1438656.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.1652960777282715,
84
+ "epoch": 0.14732965009208104,
85
+ "grad_norm": 0.36872178316116333,
86
+ "learning_rate": 2.9044117647058828e-05,
87
+ "loss": 0.2220149040222168,
88
+ "mean_token_accuracy": 0.9224777698516846,
89
+ "num_tokens": 1643877.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.154341197013855,
94
+ "epoch": 0.16574585635359115,
95
+ "grad_norm": 0.4152425229549408,
96
+ "learning_rate": 3.272058823529412e-05,
97
+ "loss": 0.2002798557281494,
98
+ "mean_token_accuracy": 0.9285802960395813,
99
+ "num_tokens": 1849506.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.1507258892059327,
104
+ "epoch": 0.1841620626151013,
105
+ "grad_norm": 0.47647765278816223,
106
+ "learning_rate": 3.639705882352941e-05,
107
+ "loss": 0.18871363401412963,
108
+ "mean_token_accuracy": 0.9318056285381318,
109
+ "num_tokens": 2055071.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.1455535531044005,
114
+ "epoch": 0.20257826887661143,
115
+ "grad_norm": 0.4853009581565857,
116
+ "learning_rate": 4.007352941176471e-05,
117
+ "loss": 0.17836341857910157,
118
+ "mean_token_accuracy": 0.9367631554603577,
119
+ "num_tokens": 2260643.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.1402526497840881,
124
+ "epoch": 0.22099447513812154,
125
+ "grad_norm": 0.4455392360687256,
126
+ "learning_rate": 4.375e-05,
127
+ "loss": 0.16921783685684205,
128
+ "mean_token_accuracy": 0.9386959195137023,
129
+ "num_tokens": 2466085.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.1374777555465698,
134
+ "epoch": 0.23941068139963168,
135
+ "grad_norm": 0.5880279541015625,
136
+ "learning_rate": 4.742647058823529e-05,
137
+ "loss": 0.15989291667938232,
138
+ "mean_token_accuracy": 0.9421182632446289,
139
+ "num_tokens": 2671024.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1273940205574036,
144
+ "epoch": 0.2578268876611418,
145
+ "grad_norm": 0.612959086894989,
146
+ "learning_rate": 5.110294117647059e-05,
147
+ "loss": 0.14701461791992188,
148
+ "mean_token_accuracy": 0.9463540315628052,
149
+ "num_tokens": 2876848.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1263513088226318,
154
+ "epoch": 0.27624309392265195,
155
+ "grad_norm": 0.5695255398750305,
156
+ "learning_rate": 5.477941176470589e-05,
157
+ "loss": 0.14604382514953612,
158
+ "mean_token_accuracy": 0.946351945400238,
159
+ "num_tokens": 3082589.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1290789365768432,
164
+ "epoch": 0.2946593001841621,
165
+ "grad_norm": 0.6608090996742249,
166
+ "learning_rate": 5.845588235294118e-05,
167
+ "loss": 0.1409450054168701,
168
+ "mean_token_accuracy": 0.9481450319290161,
169
+ "num_tokens": 3287459.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1291529774665832,
174
+ "epoch": 0.31307550644567217,
175
+ "grad_norm": 0.652715802192688,
176
+ "learning_rate": 6.213235294117647e-05,
177
+ "loss": 0.14441155195236205,
178
+ "mean_token_accuracy": 0.9466125547885895,
179
+ "num_tokens": 3493682.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1244838953018188,
184
+ "epoch": 0.3314917127071823,
185
+ "grad_norm": 0.7815241813659668,
186
+ "learning_rate": 6.580882352941177e-05,
187
+ "loss": 0.13361064195632935,
188
+ "mean_token_accuracy": 0.9512295544147491,
189
+ "num_tokens": 3699573.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1217721104621887,
194
+ "epoch": 0.34990791896869244,
195
+ "grad_norm": 0.7933160066604614,
196
+ "learning_rate": 6.948529411764706e-05,
197
+ "loss": 0.13089522123336791,
198
+ "mean_token_accuracy": 0.9520221531391144,
199
+ "num_tokens": 3905156.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1206679105758668,
204
+ "epoch": 0.3683241252302026,
205
+ "grad_norm": 0.6815240383148193,
206
+ "learning_rate": 7.316176470588236e-05,
207
+ "loss": 0.13400404453277587,
208
+ "mean_token_accuracy": 0.9501322209835052,
209
+ "num_tokens": 4110570.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.1161052227020263,
214
+ "epoch": 0.3867403314917127,
215
+ "grad_norm": 0.8297767639160156,
216
+ "learning_rate": 7.683823529411766e-05,
217
+ "loss": 0.13389937877655028,
218
+ "mean_token_accuracy": 0.9501932203769684,
219
+ "num_tokens": 4315834.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1098745942115784,
224
+ "epoch": 0.40515653775322286,
225
+ "grad_norm": 0.5943381786346436,
226
+ "learning_rate": 8.051470588235294e-05,
227
+ "loss": 0.13452907800674438,
228
+ "mean_token_accuracy": 0.9503286242485046,
229
+ "num_tokens": 4520807.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.100480353832245,
234
+ "epoch": 0.42357274401473294,
235
+ "grad_norm": 0.6094359755516052,
236
+ "learning_rate": 8.419117647058824e-05,
237
+ "loss": 0.12827746868133544,
238
+ "mean_token_accuracy": 0.952492094039917,
239
+ "num_tokens": 4725867.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.0901286959648133,
244
+ "epoch": 0.4419889502762431,
245
+ "grad_norm": 0.7240597605705261,
246
+ "learning_rate": 8.786764705882353e-05,
247
+ "loss": 0.12171242237091065,
248
+ "mean_token_accuracy": 0.953943532705307,
249
+ "num_tokens": 4931629.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.0885071873664856,
254
+ "epoch": 0.4604051565377532,
255
+ "grad_norm": 0.6939547657966614,
256
+ "learning_rate": 9.154411764705882e-05,
257
+ "loss": 0.12155698537826538,
258
+ "mean_token_accuracy": 0.9545870959758759,
259
+ "num_tokens": 5137285.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.086272156238556,
264
+ "epoch": 0.47882136279926335,
265
+ "grad_norm": 0.5752800703048706,
266
+ "learning_rate": 9.522058823529412e-05,
267
+ "loss": 0.12157790660858155,
268
+ "mean_token_accuracy": 0.9541126549243927,
269
+ "num_tokens": 5342575.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.0857678413391114,
274
+ "epoch": 0.4972375690607735,
275
+ "grad_norm": 0.7565123438835144,
276
+ "learning_rate": 9.889705882352942e-05,
277
+ "loss": 0.12349612712860107,
278
+ "mean_token_accuracy": 0.9535140514373779,
279
+ "num_tokens": 5547995.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.079762625694275,
284
+ "epoch": 0.5156537753222836,
285
+ "grad_norm": 0.6972768306732178,
286
+ "learning_rate": 9.999954556423843e-05,
287
+ "loss": 0.11875582933425903,
288
+ "mean_token_accuracy": 0.9556483089923858,
289
+ "num_tokens": 5753195.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.0742079138755798,
294
+ "epoch": 0.5340699815837937,
295
+ "grad_norm": 0.7821696996688843,
296
+ "learning_rate": 9.999731977631227e-05,
297
+ "loss": 0.11824090480804443,
298
+ "mean_token_accuracy": 0.9557521045207977,
299
+ "num_tokens": 5958236.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.0679773569107056,
304
+ "epoch": 0.5524861878453039,
305
+ "grad_norm": 0.5846888422966003,
306
+ "learning_rate": 9.999323925089486e-05,
307
+ "loss": 0.11707355976104736,
308
+ "mean_token_accuracy": 0.9554719448089599,
309
+ "num_tokens": 6163992.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.0655727863311768,
314
+ "epoch": 0.570902394106814,
315
+ "grad_norm": 0.5812502503395081,
316
+ "learning_rate": 9.998730413936037e-05,
317
+ "loss": 0.11371417045593261,
318
+ "mean_token_accuracy": 0.9576376020908356,
319
+ "num_tokens": 6369456.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.0607039332389832,
324
+ "epoch": 0.5893186003683242,
325
+ "grad_norm": 0.6238475441932678,
326
+ "learning_rate": 9.99795146618821e-05,
327
+ "loss": 0.11775733232498169,
328
+ "mean_token_accuracy": 0.9557221591472626,
329
+ "num_tokens": 6574833.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.0504255175590516,
334
+ "epoch": 0.6077348066298343,
335
+ "grad_norm": 0.6496815085411072,
336
+ "learning_rate": 9.996987110742422e-05,
337
+ "loss": 0.10904088020324706,
338
+ "mean_token_accuracy": 0.9585366368293762,
339
+ "num_tokens": 6780108.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.0456081986427308,
344
+ "epoch": 0.6261510128913443,
345
+ "grad_norm": 0.786702573299408,
346
+ "learning_rate": 9.995837383373119e-05,
347
+ "loss": 0.10642309188842773,
348
+ "mean_token_accuracy": 0.9596696078777314,
349
+ "num_tokens": 6985920.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.0455098271369934,
354
+ "epoch": 0.6445672191528545,
355
+ "grad_norm": 0.5473790168762207,
356
+ "learning_rate": 9.994502326731434e-05,
357
+ "loss": 0.10822961330413819,
358
+ "mean_token_accuracy": 0.959563136100769,
359
+ "num_tokens": 7191465.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.04240562915802,
364
+ "epoch": 0.6629834254143646,
365
+ "grad_norm": 0.6672356128692627,
366
+ "learning_rate": 9.992981990343614e-05,
367
+ "loss": 0.1110004186630249,
368
+ "mean_token_accuracy": 0.9582514643669129,
369
+ "num_tokens": 7396877.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0386811256408692,
374
+ "epoch": 0.6813996316758748,
375
+ "grad_norm": 0.698539674282074,
376
+ "learning_rate": 9.99127643060918e-05,
377
+ "loss": 0.107539963722229,
378
+ "mean_token_accuracy": 0.9593036234378814,
379
+ "num_tokens": 7602437.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.0311225533485413,
384
+ "epoch": 0.6998158379373849,
385
+ "grad_norm": 0.6629284024238586,
386
+ "learning_rate": 9.989385710798837e-05,
387
+ "loss": 0.1064023494720459,
388
+ "mean_token_accuracy": 0.9602205216884613,
389
+ "num_tokens": 7808142.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.030210506916046,
394
+ "epoch": 0.7182320441988951,
395
+ "grad_norm": 0.5616748929023743,
396
+ "learning_rate": 9.987309901052121e-05,
397
+ "loss": 0.10717041492462158,
398
+ "mean_token_accuracy": 0.9599347949028015,
399
+ "num_tokens": 8013407.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0208017826080322,
404
+ "epoch": 0.7366482504604052,
405
+ "grad_norm": 0.6329049468040466,
406
+ "learning_rate": 9.985049078374806e-05,
407
+ "loss": 0.10359601974487305,
408
+ "mean_token_accuracy": 0.9603756129741668,
409
+ "num_tokens": 8219040.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.015640377998352,
414
+ "epoch": 0.7550644567219152,
415
+ "grad_norm": 0.6516013741493225,
416
+ "learning_rate": 9.982603326636037e-05,
417
+ "loss": 0.10146439075469971,
418
+ "mean_token_accuracy": 0.9627702474594116,
419
+ "num_tokens": 8424678.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0105359435081482,
424
+ "epoch": 0.7734806629834254,
425
+ "grad_norm": 0.6920603513717651,
426
+ "learning_rate": 9.979972736565226e-05,
427
+ "loss": 0.10770498514175415,
428
+ "mean_token_accuracy": 0.9591470420360565,
429
+ "num_tokens": 8629868.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.9966452836990356,
434
+ "epoch": 0.7918968692449355,
435
+ "grad_norm": 0.6857476234436035,
436
+ "learning_rate": 9.977157405748687e-05,
437
+ "loss": 0.10282524824142455,
438
+ "mean_token_accuracy": 0.9612209022045135,
439
+ "num_tokens": 8835320.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.9945534646511078,
444
+ "epoch": 0.8103130755064457,
445
+ "grad_norm": 0.7208472490310669,
446
+ "learning_rate": 9.974157438626008e-05,
447
+ "loss": 0.10069938898086547,
448
+ "mean_token_accuracy": 0.9620070576667785,
449
+ "num_tokens": 9041123.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.979461395740509,
454
+ "epoch": 0.8287292817679558,
455
+ "grad_norm": 0.5071915984153748,
456
+ "learning_rate": 9.970972946486185e-05,
457
+ "loss": 0.09799174070358277,
458
+ "mean_token_accuracy": 0.9620374023914338,
459
+ "num_tokens": 9246361.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.9830998003482818,
464
+ "epoch": 0.8471454880294659,
465
+ "grad_norm": 0.8660802245140076,
466
+ "learning_rate": 9.967604047463493e-05,
467
+ "loss": 0.10378165245056152,
468
+ "mean_token_accuracy": 0.9606865763664245,
469
+ "num_tokens": 9451845.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.9813413023948669,
474
+ "epoch": 0.8655616942909761,
475
+ "grad_norm": 0.7642477750778198,
476
+ "learning_rate": 9.964050866533094e-05,
477
+ "loss": 0.1010061264038086,
478
+ "mean_token_accuracy": 0.9608745336532593,
479
+ "num_tokens": 9656802.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.967874163389206,
484
+ "epoch": 0.8839779005524862,
485
+ "grad_norm": 0.5987281799316406,
486
+ "learning_rate": 9.960313535506411e-05,
487
+ "loss": 0.10169394016265869,
488
+ "mean_token_accuracy": 0.9611998200416565,
489
+ "num_tokens": 9861719.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.9663491308689117,
494
+ "epoch": 0.9023941068139963,
495
+ "grad_norm": 0.6124638319015503,
496
+ "learning_rate": 9.956392193026239e-05,
497
+ "loss": 0.102389657497406,
498
+ "mean_token_accuracy": 0.9611884355545044,
499
+ "num_tokens": 10066673.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.959654438495636,
504
+ "epoch": 0.9208103130755064,
505
+ "grad_norm": 0.7873051762580872,
506
+ "learning_rate": 9.952286984561592e-05,
507
+ "loss": 0.10170392990112305,
508
+ "mean_token_accuracy": 0.9610928475856781,
509
+ "num_tokens": 10272091.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.9550537407398224,
514
+ "epoch": 0.9392265193370166,
515
+ "grad_norm": 0.6071968078613281,
516
+ "learning_rate": 9.947998062402313e-05,
517
+ "loss": 0.09448277950286865,
518
+ "mean_token_accuracy": 0.9648977637290954,
519
+ "num_tokens": 10477632.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.9538533687591553,
524
+ "epoch": 0.9576427255985267,
525
+ "grad_norm": 0.6317242980003357,
526
+ "learning_rate": 9.943525585653428e-05,
527
+ "loss": 0.09542192220687866,
528
+ "mean_token_accuracy": 0.9635261118412017,
529
+ "num_tokens": 10682828.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.9362513542175293,
534
+ "epoch": 0.9760589318600368,
535
+ "grad_norm": 0.6421944499015808,
536
+ "learning_rate": 9.938869720229234e-05,
537
+ "loss": 0.09382058382034301,
538
+ "mean_token_accuracy": 0.9648073971271515,
539
+ "num_tokens": 10888741.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.9235438346862793,
544
+ "epoch": 0.994475138121547,
545
+ "grad_norm": 0.7986873388290405,
546
+ "learning_rate": 9.934030638847155e-05,
547
+ "loss": 0.09827429056167603,
548
+ "mean_token_accuracy": 0.9621128737926483,
549
+ "num_tokens": 11094387.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "epoch": 1.0,
554
+ "eval_entropy": 0.9137652366057686,
555
+ "eval_loss": 0.09368764609098434,
556
+ "eval_mean_token_accuracy": 0.9640816880309063,
557
+ "eval_num_tokens": 11155908.0,
558
+ "eval_runtime": 10.4701,
559
+ "eval_samples_per_second": 349.377,
560
+ "eval_steps_per_second": 10.984,
561
+ "step": 543
562
+ },
563
+ {
564
+ "entropy": 0.9047818422317505,
565
+ "epoch": 1.0128913443830572,
566
+ "grad_norm": 0.6781501173973083,
567
+ "learning_rate": 9.929008521021325e-05,
568
+ "loss": 0.0863916516304016,
569
+ "mean_token_accuracy": 0.9673655688762665,
570
+ "num_tokens": 11299715.0,
571
+ "step": 550
572
+ },
573
+ {
574
+ "entropy": 0.8856981039047241,
575
+ "epoch": 1.0313075506445673,
576
+ "grad_norm": 0.7143136858940125,
577
+ "learning_rate": 9.923803553055937e-05,
578
+ "loss": 0.08632323145866394,
579
+ "mean_token_accuracy": 0.9677783191204071,
580
+ "num_tokens": 11505059.0,
581
+ "step": 560
582
+ },
583
+ {
584
+ "entropy": 0.8937099635601043,
585
+ "epoch": 1.0497237569060773,
586
+ "grad_norm": 0.7751694321632385,
587
+ "learning_rate": 9.918415928038325e-05,
588
+ "loss": 0.08178263902664185,
589
+ "mean_token_accuracy": 0.9694291114807129,
590
+ "num_tokens": 11710464.0,
591
+ "step": 570
592
+ },
593
+ {
594
+ "entropy": 0.8858704209327698,
595
+ "epoch": 1.0681399631675874,
596
+ "grad_norm": 0.7492292523384094,
597
+ "learning_rate": 9.912845845831805e-05,
598
+ "loss": 0.08074211478233337,
599
+ "mean_token_accuracy": 0.9692470014095307,
600
+ "num_tokens": 11915959.0,
601
+ "step": 580
602
+ },
603
+ {
604
+ "entropy": 0.8948039829730987,
605
+ "epoch": 1.0865561694290977,
606
+ "grad_norm": 0.8116479516029358,
607
+ "learning_rate": 9.907093513068259e-05,
608
+ "loss": 0.08712012171745301,
609
+ "mean_token_accuracy": 0.9669980227947235,
610
+ "num_tokens": 12121499.0,
611
+ "step": 590
612
+ },
613
+ {
614
+ "entropy": 0.8846789538860321,
615
+ "epoch": 1.1049723756906078,
616
+ "grad_norm": 0.7295626997947693,
617
+ "learning_rate": 9.901159143140471e-05,
618
+ "loss": 0.08444435596466064,
619
+ "mean_token_accuracy": 0.9674544095993042,
620
+ "num_tokens": 12327061.0,
621
+ "step": 600
622
+ },
623
+ {
624
+ "entropy": 0.8734103918075562,
625
+ "epoch": 1.1233885819521179,
626
+ "grad_norm": 0.9585768580436707,
627
+ "learning_rate": 9.89504295619421e-05,
628
+ "loss": 0.08022565841674804,
629
+ "mean_token_accuracy": 0.969569206237793,
630
+ "num_tokens": 12532305.0,
631
+ "step": 610
632
+ },
633
+ {
634
+ "entropy": 0.8640486001968384,
635
+ "epoch": 1.141804788213628,
636
+ "grad_norm": 0.7891159057617188,
637
+ "learning_rate": 9.88874517912006e-05,
638
+ "loss": 0.08415375947952271,
639
+ "mean_token_accuracy": 0.9678892493247986,
640
+ "num_tokens": 12737828.0,
641
+ "step": 620
642
+ },
643
+ {
644
+ "entropy": 0.8599755525588989,
645
+ "epoch": 1.160220994475138,
646
+ "grad_norm": 0.5801345109939575,
647
+ "learning_rate": 9.882266045545012e-05,
648
+ "loss": 0.08100489974021911,
649
+ "mean_token_accuracy": 0.9688023269176483,
650
+ "num_tokens": 12943343.0,
651
+ "step": 630
652
+ },
653
+ {
654
+ "entropy": 0.86524977684021,
655
+ "epoch": 1.1786372007366483,
656
+ "grad_norm": 0.7633041143417358,
657
+ "learning_rate": 9.87560579582379e-05,
658
+ "loss": 0.07859406471252442,
659
+ "mean_token_accuracy": 0.9702189445495606,
660
+ "num_tokens": 13148473.0,
661
+ "step": 640
662
+ },
663
+ {
664
+ "entropy": 0.8466695249080658,
665
+ "epoch": 1.1970534069981584,
666
+ "grad_norm": 0.8672215938568115,
667
+ "learning_rate": 9.868764677029934e-05,
668
+ "loss": 0.08082623481750488,
669
+ "mean_token_accuracy": 0.9689972400665283,
670
+ "num_tokens": 13353890.0,
671
+ "step": 650
672
+ },
673
+ {
674
+ "entropy": 0.8596941530704498,
675
+ "epoch": 1.2154696132596685,
676
+ "grad_norm": 0.7524124383926392,
677
+ "learning_rate": 9.861742942946639e-05,
678
+ "loss": 0.0789935290813446,
679
+ "mean_token_accuracy": 0.9693858206272126,
680
+ "num_tokens": 13559475.0,
681
+ "step": 660
682
+ },
683
+ {
684
+ "entropy": 0.8708749234676361,
685
+ "epoch": 1.2338858195211786,
686
+ "grad_norm": 0.5777031183242798,
687
+ "learning_rate": 9.854540854057337e-05,
688
+ "loss": 0.07773642539978028,
689
+ "mean_token_accuracy": 0.970385092496872,
690
+ "num_tokens": 13765076.0,
691
+ "step": 670
692
+ },
693
+ {
694
+ "entropy": 0.8651713371276856,
695
+ "epoch": 1.2523020257826887,
696
+ "grad_norm": 0.7924166321754456,
697
+ "learning_rate": 9.847158677536034e-05,
698
+ "loss": 0.0766686737537384,
699
+ "mean_token_accuracy": 0.9702267110347748,
700
+ "num_tokens": 13970642.0,
701
+ "step": 680
702
+ },
703
+ {
704
+ "entropy": 0.8763024985790253,
705
+ "epoch": 1.270718232044199,
706
+ "grad_norm": 0.741219162940979,
707
+ "learning_rate": 9.839596687237403e-05,
708
+ "loss": 0.07189929485321045,
709
+ "mean_token_accuracy": 0.9727097094058991,
710
+ "num_tokens": 14176556.0,
711
+ "step": 690
712
+ },
713
+ {
714
+ "entropy": 0.8556921362876893,
715
+ "epoch": 1.289134438305709,
716
+ "grad_norm": 0.6298198103904724,
717
+ "learning_rate": 9.831855163686618e-05,
718
+ "loss": 0.07608137726783752,
719
+ "mean_token_accuracy": 0.9716399371623993,
720
+ "num_tokens": 14381686.0,
721
+ "step": 700
722
+ },
723
+ {
724
+ "entropy": 0.869178420305252,
725
+ "epoch": 1.3075506445672191,
726
+ "grad_norm": 0.5850273370742798,
727
+ "learning_rate": 9.823934394068952e-05,
728
+ "loss": 0.07437651753425598,
729
+ "mean_token_accuracy": 0.9709566533565521,
730
+ "num_tokens": 14586814.0,
731
+ "step": 710
732
+ },
733
+ {
734
+ "entropy": 0.8708595156669616,
735
+ "epoch": 1.3259668508287292,
736
+ "grad_norm": 0.6580632328987122,
737
+ "learning_rate": 9.815834672219127e-05,
738
+ "loss": 0.07518917322158813,
739
+ "mean_token_accuracy": 0.9717426657676697,
740
+ "num_tokens": 14792321.0,
741
+ "step": 720
742
+ },
743
+ {
744
+ "entropy": 0.8826817810535431,
745
+ "epoch": 1.3443830570902393,
746
+ "grad_norm": 0.8788532018661499,
747
+ "learning_rate": 9.807556298610404e-05,
748
+ "loss": 0.07579240798950196,
749
+ "mean_token_accuracy": 0.9706341981887817,
750
+ "num_tokens": 14997810.0,
751
+ "step": 730
752
+ },
753
+ {
754
+ "entropy": 0.9012470185756684,
755
+ "epoch": 1.3627992633517496,
756
+ "grad_norm": 0.7022138237953186,
757
+ "learning_rate": 9.799099580343441e-05,
758
+ "loss": 0.0775588572025299,
759
+ "mean_token_accuracy": 0.9699241399765015,
760
+ "num_tokens": 15203795.0,
761
+ "step": 740
762
+ },
763
+ {
764
+ "entropy": 0.886955714225769,
765
+ "epoch": 1.3812154696132597,
766
+ "grad_norm": 0.7881133556365967,
767
+ "learning_rate": 9.790464831134903e-05,
768
+ "loss": 0.07125020027160645,
769
+ "mean_token_accuracy": 0.9723815560340882,
770
+ "num_tokens": 15408974.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9047374844551086,
775
+ "epoch": 1.3996316758747698,
776
+ "grad_norm": 0.9082005023956299,
777
+ "learning_rate": 9.781652371305824e-05,
778
+ "loss": 0.07004334926605224,
779
+ "mean_token_accuracy": 0.9725580036640167,
780
+ "num_tokens": 15614399.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9039053857326508,
785
+ "epoch": 1.4180478821362799,
786
+ "grad_norm": 0.8060817122459412,
787
+ "learning_rate": 9.77266252776972e-05,
788
+ "loss": 0.07103485465049744,
789
+ "mean_token_accuracy": 0.9721468150615692,
790
+ "num_tokens": 15819895.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.8998047232627868,
795
+ "epoch": 1.43646408839779,
796
+ "grad_norm": 1.0152642726898193,
797
+ "learning_rate": 9.763495634020467e-05,
798
+ "loss": 0.07411704063415528,
799
+ "mean_token_accuracy": 0.9711063146591187,
800
+ "num_tokens": 16025297.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.9120213568210602,
805
+ "epoch": 1.4548802946593002,
806
+ "grad_norm": 0.6288319826126099,
807
+ "learning_rate": 9.754152030119921e-05,
808
+ "loss": 0.07223712205886841,
809
+ "mean_token_accuracy": 0.9722476422786712,
810
+ "num_tokens": 16230656.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.9142370820045471,
815
+ "epoch": 1.4732965009208103,
816
+ "grad_norm": 0.7854700088500977,
817
+ "learning_rate": 9.744632062685311e-05,
818
+ "loss": 0.07186744809150696,
819
+ "mean_token_accuracy": 0.972247713804245,
820
+ "num_tokens": 16435943.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.8920814216136932,
825
+ "epoch": 1.4917127071823204,
826
+ "grad_norm": 0.6227074265480042,
827
+ "learning_rate": 9.734936084876383e-05,
828
+ "loss": 0.07016961574554444,
829
+ "mean_token_accuracy": 0.9725603640079499,
830
+ "num_tokens": 16641635.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.891328877210617,
835
+ "epoch": 1.5101289134438307,
836
+ "grad_norm": 0.7601346969604492,
837
+ "learning_rate": 9.725064456382283e-05,
838
+ "loss": 0.07137494087219239,
839
+ "mean_token_accuracy": 0.9722997546195984,
840
+ "num_tokens": 16847194.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.8921217978000641,
845
+ "epoch": 1.5285451197053406,
846
+ "grad_norm": 0.7813850045204163,
847
+ "learning_rate": 9.715017543408233e-05,
848
+ "loss": 0.06890199184417725,
849
+ "mean_token_accuracy": 0.9735044002532959,
850
+ "num_tokens": 17052807.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.9085914671421051,
855
+ "epoch": 1.5469613259668509,
856
+ "grad_norm": 0.6184289455413818,
857
+ "learning_rate": 9.704795718661939e-05,
858
+ "loss": 0.07043765187263488,
859
+ "mean_token_accuracy": 0.9725716531276702,
860
+ "num_tokens": 17258284.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9029861629009247,
865
+ "epoch": 1.565377532228361,
866
+ "grad_norm": 0.7082377076148987,
867
+ "learning_rate": 9.694399361339752e-05,
868
+ "loss": 0.07113839387893676,
869
+ "mean_token_accuracy": 0.9725669205188752,
870
+ "num_tokens": 17464326.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.8856533527374267,
875
+ "epoch": 1.583793738489871,
876
+ "grad_norm": 0.7409216165542603,
877
+ "learning_rate": 9.683828857112627e-05,
878
+ "loss": 0.07077333331108093,
879
+ "mean_token_accuracy": 0.9731084644794464,
880
+ "num_tokens": 17669537.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.8613030433654785,
885
+ "epoch": 1.6022099447513813,
886
+ "grad_norm": 0.6801561713218689,
887
+ "learning_rate": 9.673084598111789e-05,
888
+ "loss": 0.06885308027267456,
889
+ "mean_token_accuracy": 0.97266526222229,
890
+ "num_tokens": 17875289.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.8692965865135193,
895
+ "epoch": 1.6206261510128912,
896
+ "grad_norm": 1.1621277332305908,
897
+ "learning_rate": 9.662166982914203e-05,
898
+ "loss": 0.07017780542373657,
899
+ "mean_token_accuracy": 0.9733059942722321,
900
+ "num_tokens": 18080404.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.8671502113342285,
905
+ "epoch": 1.6390423572744015,
906
+ "grad_norm": 0.7518903613090515,
907
+ "learning_rate": 9.651076416527787e-05,
908
+ "loss": 0.06977018713951111,
909
+ "mean_token_accuracy": 0.9730017304420471,
910
+ "num_tokens": 18285699.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.8662045657634735,
915
+ "epoch": 1.6574585635359116,
916
+ "grad_norm": 0.6622698903083801,
917
+ "learning_rate": 9.639813310376378e-05,
918
+ "loss": 0.06620995998382569,
919
+ "mean_token_accuracy": 0.9737491130828857,
920
+ "num_tokens": 18491097.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.8548173069953918,
925
+ "epoch": 1.6758747697974217,
926
+ "grad_norm": 0.8941843509674072,
927
+ "learning_rate": 9.628378082284479e-05,
928
+ "loss": 0.06711119413375854,
929
+ "mean_token_accuracy": 0.9740589797496796,
930
+ "num_tokens": 18696827.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.8763562262058258,
935
+ "epoch": 1.694290976058932,
936
+ "grad_norm": 0.7571700215339661,
937
+ "learning_rate": 9.616771156461755e-05,
938
+ "loss": 0.07263468503952027,
939
+ "mean_token_accuracy": 0.9717419981956482,
940
+ "num_tokens": 18902513.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.8663733780384064,
945
+ "epoch": 1.7127071823204418,
946
+ "grad_norm": 0.7886489629745483,
947
+ "learning_rate": 9.604992963487298e-05,
948
+ "loss": 0.07074605226516724,
949
+ "mean_token_accuracy": 0.9724965393543243,
950
+ "num_tokens": 19107812.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.8673004627227783,
955
+ "epoch": 1.7311233885819521,
956
+ "grad_norm": 0.8180726170539856,
957
+ "learning_rate": 9.593043940293647e-05,
958
+ "loss": 0.06831735372543335,
959
+ "mean_token_accuracy": 0.9733696818351746,
960
+ "num_tokens": 19313330.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.8525971233844757,
965
+ "epoch": 1.7495395948434622,
966
+ "grad_norm": 0.6576228737831116,
967
+ "learning_rate": 9.580924530150595e-05,
968
+ "loss": 0.06567002534866333,
969
+ "mean_token_accuracy": 0.9745754361152649,
970
+ "num_tokens": 19518671.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.8605451703071594,
975
+ "epoch": 1.7679558011049723,
976
+ "grad_norm": 0.7171661257743835,
977
+ "learning_rate": 9.568635182648725e-05,
978
+ "loss": 0.06872050762176514,
979
+ "mean_token_accuracy": 0.9732091546058654,
980
+ "num_tokens": 19724135.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.8642210960388184,
985
+ "epoch": 1.7863720073664826,
986
+ "grad_norm": 0.7603147029876709,
987
+ "learning_rate": 9.556176353682746e-05,
988
+ "loss": 0.06766576766967773,
989
+ "mean_token_accuracy": 0.9728681743144989,
990
+ "num_tokens": 19928785.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.8543185651302337,
995
+ "epoch": 1.8047882136279927,
996
+ "grad_norm": 0.7280875444412231,
997
+ "learning_rate": 9.543548505434581e-05,
998
+ "loss": 0.06851862668991089,
999
+ "mean_token_accuracy": 0.9737437188625335,
1000
+ "num_tokens": 20134195.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.8744745373725891,
1005
+ "epoch": 1.8232044198895028,
1006
+ "grad_norm": 0.5897248983383179,
1007
+ "learning_rate": 9.530752106356209e-05,
1008
+ "loss": 0.06809053421020508,
1009
+ "mean_token_accuracy": 0.9733593761920929,
1010
+ "num_tokens": 20339517.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.8623859465122223,
1015
+ "epoch": 1.8416206261510129,
1016
+ "grad_norm": 0.7515265345573425,
1017
+ "learning_rate": 9.517787631152298e-05,
1018
+ "loss": 0.07257847785949707,
1019
+ "mean_token_accuracy": 0.9714054942131043,
1020
+ "num_tokens": 20545249.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.8669404804706573,
1025
+ "epoch": 1.860036832412523,
1026
+ "grad_norm": 0.7144560813903809,
1027
+ "learning_rate": 9.504655560762596e-05,
1028
+ "loss": 0.06832354068756104,
1029
+ "mean_token_accuracy": 0.9735779523849487,
1030
+ "num_tokens": 20750507.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.8493516445159912,
1035
+ "epoch": 1.8784530386740332,
1036
+ "grad_norm": 0.6559189558029175,
1037
+ "learning_rate": 9.491356382344081e-05,
1038
+ "loss": 0.0629766047000885,
1039
+ "mean_token_accuracy": 0.9754977762699127,
1040
+ "num_tokens": 20955956.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.8599376022815705,
1045
+ "epoch": 1.8968692449355433,
1046
+ "grad_norm": 0.6792973279953003,
1047
+ "learning_rate": 9.477890589252895e-05,
1048
+ "loss": 0.0666757881641388,
1049
+ "mean_token_accuracy": 0.974083811044693,
1050
+ "num_tokens": 21161163.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.8458438158035279,
1055
+ "epoch": 1.9152854511970534,
1056
+ "grad_norm": 0.6941778659820557,
1057
+ "learning_rate": 9.464258681026042e-05,
1058
+ "loss": 0.06307152509689332,
1059
+ "mean_token_accuracy": 0.9757042229175568,
1060
+ "num_tokens": 21366525.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.848515909910202,
1065
+ "epoch": 1.9337016574585635,
1066
+ "grad_norm": 0.7307806611061096,
1067
+ "learning_rate": 9.450461163362855e-05,
1068
+ "loss": 0.06307026147842407,
1069
+ "mean_token_accuracy": 0.9750974595546722,
1070
+ "num_tokens": 21572238.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.8563454031944275,
1075
+ "epoch": 1.9521178637200736,
1076
+ "grad_norm": 0.7222106456756592,
1077
+ "learning_rate": 9.436498548106236e-05,
1078
+ "loss": 0.0647726058959961,
1079
+ "mean_token_accuracy": 0.974629694223404,
1080
+ "num_tokens": 21777633.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.8656457483768463,
1085
+ "epoch": 1.9705340699815839,
1086
+ "grad_norm": 0.67178875207901,
1087
+ "learning_rate": 9.422371353223674e-05,
1088
+ "loss": 0.06573554277420043,
1089
+ "mean_token_accuracy": 0.9745908617973328,
1090
+ "num_tokens": 21983116.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.8630891263484954,
1095
+ "epoch": 1.988950276243094,
1096
+ "grad_norm": 0.6956593990325928,
1097
+ "learning_rate": 9.408080102788016e-05,
1098
+ "loss": 0.06630704402923585,
1099
+ "mean_token_accuracy": 0.9741333484649658,
1100
+ "num_tokens": 22188662.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "epoch": 2.0,
1105
+ "eval_entropy": 0.8560857042022373,
1106
+ "eval_loss": 0.06494329869747162,
1107
+ "eval_mean_token_accuracy": 0.9745692672936813,
1108
+ "eval_num_tokens": 22311800.0,
1109
+ "eval_runtime": 10.129,
1110
+ "eval_samples_per_second": 361.142,
1111
+ "eval_steps_per_second": 11.354,
1112
+ "step": 1086
1113
+ },
1114
+ {
1115
+ "entropy": 0.8616272270679474,
1116
+ "epoch": 2.007366482504604,
1117
+ "grad_norm": 0.7778105139732361,
1118
+ "learning_rate": 9.393625326958041e-05,
1119
+ "loss": 0.054407155513763426,
1120
+ "mean_token_accuracy": 0.9792074799537659,
1121
+ "num_tokens": 22394215.0,
1122
+ "step": 1090
1123
+ },
1124
+ {
1125
+ "entropy": 0.8496910452842712,
1126
+ "epoch": 2.0257826887661143,
1127
+ "grad_norm": 0.7422528266906738,
1128
+ "learning_rate": 9.379007561958792e-05,
1129
+ "loss": 0.051881587505340575,
1130
+ "mean_token_accuracy": 0.9799090325832367,
1131
+ "num_tokens": 22599599.0,
1132
+ "step": 1100
1133
+ },
1134
+ {
1135
+ "entropy": 0.8531602442264556,
1136
+ "epoch": 2.044198895027624,
1137
+ "grad_norm": 0.9075332880020142,
1138
+ "learning_rate": 9.36422735006167e-05,
1139
+ "loss": 0.05190724730491638,
1140
+ "mean_token_accuracy": 0.979931116104126,
1141
+ "num_tokens": 22805318.0,
1142
+ "step": 1110
1143
+ },
1144
+ {
1145
+ "entropy": 0.8657277703285218,
1146
+ "epoch": 2.0626151012891345,
1147
+ "grad_norm": 0.9466913938522339,
1148
+ "learning_rate": 9.349285239564325e-05,
1149
+ "loss": 0.053853434324264524,
1150
+ "mean_token_accuracy": 0.9796103596687317,
1151
+ "num_tokens": 23010438.0,
1152
+ "step": 1120
1153
+ },
1154
+ {
1155
+ "entropy": 0.8578485429286957,
1156
+ "epoch": 2.0810313075506444,
1157
+ "grad_norm": 0.6903054714202881,
1158
+ "learning_rate": 9.334181784770326e-05,
1159
+ "loss": 0.05228850841522217,
1160
+ "mean_token_accuracy": 0.9802409887313843,
1161
+ "num_tokens": 23215795.0,
1162
+ "step": 1130
1163
+ },
1164
+ {
1165
+ "entropy": 0.8450767934322357,
1166
+ "epoch": 2.0994475138121547,
1167
+ "grad_norm": 0.6615211367607117,
1168
+ "learning_rate": 9.318917545968581e-05,
1169
+ "loss": 0.050570905208587646,
1170
+ "mean_token_accuracy": 0.9802053451538086,
1171
+ "num_tokens": 23421157.0,
1172
+ "step": 1140
1173
+ },
1174
+ {
1175
+ "entropy": 0.8325044393539429,
1176
+ "epoch": 2.117863720073665,
1177
+ "grad_norm": 0.760960578918457,
1178
+ "learning_rate": 9.303493089412564e-05,
1179
+ "loss": 0.051966112852096555,
1180
+ "mean_token_accuracy": 0.9796205997467041,
1181
+ "num_tokens": 23626584.0,
1182
+ "step": 1150
1183
+ },
1184
+ {
1185
+ "entropy": 0.8416404843330383,
1186
+ "epoch": 2.136279926335175,
1187
+ "grad_norm": 0.6947009563446045,
1188
+ "learning_rate": 9.287908987299306e-05,
1189
+ "loss": 0.05144861936569214,
1190
+ "mean_token_accuracy": 0.9800034642219544,
1191
+ "num_tokens": 23832137.0,
1192
+ "step": 1160
1193
+ },
1194
+ {
1195
+ "entropy": 0.8564540028572083,
1196
+ "epoch": 2.154696132596685,
1197
+ "grad_norm": 0.733252763748169,
1198
+ "learning_rate": 9.272165817748164e-05,
1199
+ "loss": 0.04944799542427063,
1200
+ "mean_token_accuracy": 0.9808157980442047,
1201
+ "num_tokens": 24038006.0,
1202
+ "step": 1170
1203
+ },
1204
+ {
1205
+ "entropy": 0.8575525343418121,
1206
+ "epoch": 2.1731123388581954,
1207
+ "grad_norm": 0.8911028504371643,
1208
+ "learning_rate": 9.25626416477938e-05,
1209
+ "loss": 0.05037952661514282,
1210
+ "mean_token_accuracy": 0.980946284532547,
1211
+ "num_tokens": 24243374.0,
1212
+ "step": 1180
1213
+ },
1214
+ {
1215
+ "entropy": 0.8599720418453216,
1216
+ "epoch": 2.1915285451197053,
1217
+ "grad_norm": 0.7713524103164673,
1218
+ "learning_rate": 9.240204618292416e-05,
1219
+ "loss": 0.050603735446929934,
1220
+ "mean_token_accuracy": 0.980896121263504,
1221
+ "num_tokens": 24448585.0,
1222
+ "step": 1190
1223
+ },
1224
+ {
1225
+ "entropy": 0.8566664934158326,
1226
+ "epoch": 2.2099447513812156,
1227
+ "grad_norm": 0.8439353704452515,
1228
+ "learning_rate": 9.223987774044066e-05,
1229
+ "loss": 0.054171699285507205,
1230
+ "mean_token_accuracy": 0.9796543836593627,
1231
+ "num_tokens": 24653863.0,
1232
+ "step": 1200
1233
+ },
1234
+ {
1235
+ "entropy": 0.846601277589798,
1236
+ "epoch": 2.2283609576427255,
1237
+ "grad_norm": 0.7025637030601501,
1238
+ "learning_rate": 9.207614233626356e-05,
1239
+ "loss": 0.048924127221107484,
1240
+ "mean_token_accuracy": 0.9809681415557862,
1241
+ "num_tokens": 24859801.0,
1242
+ "step": 1210
1243
+ },
1244
+ {
1245
+ "entropy": 0.8564423739910125,
1246
+ "epoch": 2.2467771639042358,
1247
+ "grad_norm": 0.7788274884223938,
1248
+ "learning_rate": 9.191084604444233e-05,
1249
+ "loss": 0.05260283350944519,
1250
+ "mean_token_accuracy": 0.9793797850608825,
1251
+ "num_tokens": 25065368.0,
1252
+ "step": 1220
1253
+ },
1254
+ {
1255
+ "entropy": 0.865056723356247,
1256
+ "epoch": 2.265193370165746,
1257
+ "grad_norm": 0.8728818297386169,
1258
+ "learning_rate": 9.174399499693027e-05,
1259
+ "loss": 0.05016371011734009,
1260
+ "mean_token_accuracy": 0.9807134211063385,
1261
+ "num_tokens": 25270945.0,
1262
+ "step": 1230
1263
+ },
1264
+ {
1265
+ "entropy": 0.8642262935638427,
1266
+ "epoch": 2.283609576427256,
1267
+ "grad_norm": 1.0582489967346191,
1268
+ "learning_rate": 9.157559538335703e-05,
1269
+ "loss": 0.05316779017448425,
1270
+ "mean_token_accuracy": 0.9794209063053131,
1271
+ "num_tokens": 25476575.0,
1272
+ "step": 1240
1273
+ },
1274
+ {
1275
+ "entropy": 0.8677761554718018,
1276
+ "epoch": 2.3020257826887662,
1277
+ "grad_norm": 0.760109543800354,
1278
+ "learning_rate": 9.140565345079901e-05,
1279
+ "loss": 0.05115479230880737,
1280
+ "mean_token_accuracy": 0.9802310705184937,
1281
+ "num_tokens": 25682814.0,
1282
+ "step": 1250
1283
+ },
1284
+ {
1285
+ "entropy": 0.8592945456504821,
1286
+ "epoch": 2.320441988950276,
1287
+ "grad_norm": 0.6537907123565674,
1288
+ "learning_rate": 9.123417550354761e-05,
1289
+ "loss": 0.050543540716171266,
1290
+ "mean_token_accuracy": 0.9806945025920868,
1291
+ "num_tokens": 25887575.0,
1292
+ "step": 1260
1293
+ },
1294
+ {
1295
+ "entropy": 0.8692500293254852,
1296
+ "epoch": 2.3388581952117864,
1297
+ "grad_norm": 0.7771905064582825,
1298
+ "learning_rate": 9.106116790287541e-05,
1299
+ "loss": 0.049718713760375975,
1300
+ "mean_token_accuracy": 0.9805168390274048,
1301
+ "num_tokens": 26092950.0,
1302
+ "step": 1270
1303
+ },
1304
+ {
1305
+ "entropy": 0.8841261565685272,
1306
+ "epoch": 2.3572744014732967,
1307
+ "grad_norm": 0.7791076898574829,
1308
+ "learning_rate": 9.08866370668001e-05,
1309
+ "loss": 0.0527400553226471,
1310
+ "mean_token_accuracy": 0.9796754539012908,
1311
+ "num_tokens": 26298182.0,
1312
+ "step": 1280
1313
+ },
1314
+ {
1315
+ "entropy": 0.8675022900104523,
1316
+ "epoch": 2.3756906077348066,
1317
+ "grad_norm": 0.8481605648994446,
1318
+ "learning_rate": 9.07105894698464e-05,
1319
+ "loss": 0.05320838689804077,
1320
+ "mean_token_accuracy": 0.9792274832725525,
1321
+ "num_tokens": 26503425.0,
1322
+ "step": 1290
1323
+ },
1324
+ {
1325
+ "entropy": 0.8704026222229004,
1326
+ "epoch": 2.394106813996317,
1327
+ "grad_norm": 0.8235505819320679,
1328
+ "learning_rate": 9.053303164280602e-05,
1329
+ "loss": 0.055045205354690555,
1330
+ "mean_token_accuracy": 0.9788750648498535,
1331
+ "num_tokens": 26708755.0,
1332
+ "step": 1300
1333
+ },
1334
+ {
1335
+ "entropy": 0.8525134027004242,
1336
+ "epoch": 2.4125230202578267,
1337
+ "grad_norm": 0.7611598968505859,
1338
+ "learning_rate": 9.035397017249518e-05,
1339
+ "loss": 0.05029621124267578,
1340
+ "mean_token_accuracy": 0.9802757322788238,
1341
+ "num_tokens": 26914704.0,
1342
+ "step": 1310
1343
+ },
1344
+ {
1345
+ "entropy": 0.8630305290222168,
1346
+ "epoch": 2.430939226519337,
1347
+ "grad_norm": 0.790408194065094,
1348
+ "learning_rate": 9.017341170151041e-05,
1349
+ "loss": 0.04856040775775909,
1350
+ "mean_token_accuracy": 0.9809690833091735,
1351
+ "num_tokens": 27120151.0,
1352
+ "step": 1320
1353
+ },
1354
+ {
1355
+ "entropy": 0.8579159140586853,
1356
+ "epoch": 2.4493554327808473,
1357
+ "grad_norm": 0.781972348690033,
1358
+ "learning_rate": 8.999136292798207e-05,
1359
+ "loss": 0.04869682788848877,
1360
+ "mean_token_accuracy": 0.9816130697727203,
1361
+ "num_tokens": 27325673.0,
1362
+ "step": 1330
1363
+ },
1364
+ {
1365
+ "entropy": 0.8634716987609863,
1366
+ "epoch": 2.467771639042357,
1367
+ "grad_norm": 0.8500784039497375,
1368
+ "learning_rate": 8.980783060532588e-05,
1369
+ "loss": 0.05050289034843445,
1370
+ "mean_token_accuracy": 0.980079609155655,
1371
+ "num_tokens": 27531270.0,
1372
+ "step": 1340
1373
+ },
1374
+ {
1375
+ "entropy": 0.8660618126392364,
1376
+ "epoch": 2.4861878453038675,
1377
+ "grad_norm": 0.719760537147522,
1378
+ "learning_rate": 8.96228215419924e-05,
1379
+ "loss": 0.04892141819000244,
1380
+ "mean_token_accuracy": 0.9814020991325378,
1381
+ "num_tokens": 27736542.0,
1382
+ "step": 1350
1383
+ },
1384
+ {
1385
+ "entropy": 0.8572284400463104,
1386
+ "epoch": 2.5046040515653774,
1387
+ "grad_norm": 1.0197229385375977,
1388
+ "learning_rate": 8.943634260121442e-05,
1389
+ "loss": 0.05104702711105347,
1390
+ "mean_token_accuracy": 0.9798846662044525,
1391
+ "num_tokens": 27941566.0,
1392
+ "step": 1360
1393
+ },
1394
+ {
1395
+ "entropy": 0.8702241241931915,
1396
+ "epoch": 2.5230202578268877,
1397
+ "grad_norm": 0.7136003375053406,
1398
+ "learning_rate": 8.924840070075247e-05,
1399
+ "loss": 0.04855787754058838,
1400
+ "mean_token_accuracy": 0.9811685383319855,
1401
+ "num_tokens": 28146943.0,
1402
+ "step": 1370
1403
+ },
1404
+ {
1405
+ "entropy": 0.874957013130188,
1406
+ "epoch": 2.541436464088398,
1407
+ "grad_norm": 0.8775497674942017,
1408
+ "learning_rate": 8.905900281263804e-05,
1409
+ "loss": 0.052434295415878296,
1410
+ "mean_token_accuracy": 0.9795438170433044,
1411
+ "num_tokens": 28352640.0,
1412
+ "step": 1380
1413
+ },
1414
+ {
1415
+ "entropy": 0.8776536166667939,
1416
+ "epoch": 2.559852670349908,
1417
+ "grad_norm": 0.8895741105079651,
1418
+ "learning_rate": 8.8868155962915e-05,
1419
+ "loss": 0.05282890796661377,
1420
+ "mean_token_accuracy": 0.9790538609027862,
1421
+ "num_tokens": 28558153.0,
1422
+ "step": 1390
1423
+ },
1424
+ {
1425
+ "entropy": 0.8738743245601654,
1426
+ "epoch": 2.578268876611418,
1427
+ "grad_norm": 0.788800060749054,
1428
+ "learning_rate": 8.867586723137906e-05,
1429
+ "loss": 0.048841872811317445,
1430
+ "mean_token_accuracy": 0.9809149026870727,
1431
+ "num_tokens": 28763613.0,
1432
+ "step": 1400
1433
+ },
1434
+ {
1435
+ "entropy": 0.8750253796577454,
1436
+ "epoch": 2.596685082872928,
1437
+ "grad_norm": 0.8738002777099609,
1438
+ "learning_rate": 8.848214375131497e-05,
1439
+ "loss": 0.048261132836341855,
1440
+ "mean_token_accuracy": 0.980789190530777,
1441
+ "num_tokens": 28969248.0,
1442
+ "step": 1410
1443
+ },
1444
+ {
1445
+ "entropy": 0.8624245524406433,
1446
+ "epoch": 2.6151012891344383,
1447
+ "grad_norm": 0.6404895186424255,
1448
+ "learning_rate": 8.828699270923196e-05,
1449
+ "loss": 0.04970468282699585,
1450
+ "mean_token_accuracy": 0.9807762265205383,
1451
+ "num_tokens": 29174779.0,
1452
+ "step": 1420
1453
+ },
1454
+ {
1455
+ "entropy": 0.8792938470840455,
1456
+ "epoch": 2.6335174953959486,
1457
+ "grad_norm": 0.7856965661048889,
1458
+ "learning_rate": 8.80904213445972e-05,
1459
+ "loss": 0.053334391117095946,
1460
+ "mean_token_accuracy": 0.9790222108364105,
1461
+ "num_tokens": 29380474.0,
1462
+ "step": 1430
1463
+ },
1464
+ {
1465
+ "entropy": 0.8831034600734711,
1466
+ "epoch": 2.6519337016574585,
1467
+ "grad_norm": 0.7739618420600891,
1468
+ "learning_rate": 8.789243694956716e-05,
1469
+ "loss": 0.04959054589271546,
1470
+ "mean_token_accuracy": 0.9803965091705322,
1471
+ "num_tokens": 29585985.0,
1472
+ "step": 1440
1473
+ },
1474
+ {
1475
+ "entropy": 0.8934672951698304,
1476
+ "epoch": 2.6703499079189688,
1477
+ "grad_norm": 0.6999697089195251,
1478
+ "learning_rate": 8.769304686871719e-05,
1479
+ "loss": 0.05165250301361084,
1480
+ "mean_token_accuracy": 0.9798884153366089,
1481
+ "num_tokens": 29791238.0,
1482
+ "step": 1450
1483
+ },
1484
+ {
1485
+ "entropy": 0.9053199410438537,
1486
+ "epoch": 2.6887661141804786,
1487
+ "grad_norm": 0.9199564456939697,
1488
+ "learning_rate": 8.749225849876892e-05,
1489
+ "loss": 0.04924143850803375,
1490
+ "mean_token_accuracy": 0.9810785710811615,
1491
+ "num_tokens": 29996589.0,
1492
+ "step": 1460
1493
+ },
1494
+ {
1495
+ "entropy": 0.888091403245926,
1496
+ "epoch": 2.707182320441989,
1497
+ "grad_norm": 0.7480106353759766,
1498
+ "learning_rate": 8.729007928831597e-05,
1499
+ "loss": 0.04948916733264923,
1500
+ "mean_token_accuracy": 0.9809579730033875,
1501
+ "num_tokens": 30201875.0,
1502
+ "step": 1470
1503
+ },
1504
+ {
1505
+ "entropy": 0.8723407983779907,
1506
+ "epoch": 2.7255985267034992,
1507
+ "grad_norm": 0.9506945013999939,
1508
+ "learning_rate": 8.708651673754763e-05,
1509
+ "loss": 0.048927539587020875,
1510
+ "mean_token_accuracy": 0.980553150177002,
1511
+ "num_tokens": 30407550.0,
1512
+ "step": 1480
1513
+ },
1514
+ {
1515
+ "entropy": 0.8737521529197693,
1516
+ "epoch": 2.744014732965009,
1517
+ "grad_norm": 0.8015706539154053,
1518
+ "learning_rate": 8.688157839797062e-05,
1519
+ "loss": 0.04963063597679138,
1520
+ "mean_token_accuracy": 0.9809738755226135,
1521
+ "num_tokens": 30612839.0,
1522
+ "step": 1490
1523
+ },
1524
+ {
1525
+ "entropy": 0.8800762951374054,
1526
+ "epoch": 2.7624309392265194,
1527
+ "grad_norm": 0.9429986476898193,
1528
+ "learning_rate": 8.667527187212885e-05,
1529
+ "loss": 0.0524174690246582,
1530
+ "mean_token_accuracy": 0.9788767337799072,
1531
+ "num_tokens": 30818578.0,
1532
+ "step": 1500
1533
+ },
1534
+ {
1535
+ "entropy": 0.8871055901050567,
1536
+ "epoch": 2.7808471454880292,
1537
+ "grad_norm": 0.5909196138381958,
1538
+ "learning_rate": 8.646760481332157e-05,
1539
+ "loss": 0.05166680812835693,
1540
+ "mean_token_accuracy": 0.980216771364212,
1541
+ "num_tokens": 31023829.0,
1542
+ "step": 1510
1543
+ },
1544
+ {
1545
+ "entropy": 0.8908755779266357,
1546
+ "epoch": 2.7992633517495396,
1547
+ "grad_norm": 0.9154611229896545,
1548
+ "learning_rate": 8.625858492531931e-05,
1549
+ "loss": 0.04951836466789246,
1550
+ "mean_token_accuracy": 0.9801484227180481,
1551
+ "num_tokens": 31229635.0,
1552
+ "step": 1520
1553
+ },
1554
+ {
1555
+ "entropy": 0.92480548620224,
1556
+ "epoch": 2.81767955801105,
1557
+ "grad_norm": 0.5989938378334045,
1558
+ "learning_rate": 8.604821996207819e-05,
1559
+ "loss": 0.04799881279468536,
1560
+ "mean_token_accuracy": 0.9817522585391998,
1561
+ "num_tokens": 31435456.0,
1562
+ "step": 1530
1563
+ },
1564
+ {
1565
+ "entropy": 0.9173881888389588,
1566
+ "epoch": 2.8360957642725597,
1567
+ "grad_norm": 0.899413526058197,
1568
+ "learning_rate": 8.58365177274522e-05,
1569
+ "loss": 0.0487445592880249,
1570
+ "mean_token_accuracy": 0.9812625288963318,
1571
+ "num_tokens": 31640904.0,
1572
+ "step": 1540
1573
+ },
1574
+ {
1575
+ "entropy": 0.9076135993003845,
1576
+ "epoch": 2.85451197053407,
1577
+ "grad_norm": 0.8494166135787964,
1578
+ "learning_rate": 8.562348607490376e-05,
1579
+ "loss": 0.05005228519439697,
1580
+ "mean_token_accuracy": 0.9806681036949157,
1581
+ "num_tokens": 31845807.0,
1582
+ "step": 1550
1583
+ },
1584
+ {
1585
+ "entropy": 0.9092245221138,
1586
+ "epoch": 2.87292817679558,
1587
+ "grad_norm": 0.8225123286247253,
1588
+ "learning_rate": 8.540913290721234e-05,
1589
+ "loss": 0.048654764890670776,
1590
+ "mean_token_accuracy": 0.9805659353733063,
1591
+ "num_tokens": 32051523.0,
1592
+ "step": 1560
1593
+ },
1594
+ {
1595
+ "entropy": 0.9062779664993286,
1596
+ "epoch": 2.89134438305709,
1597
+ "grad_norm": 0.7074014544487,
1598
+ "learning_rate": 8.519346617618134e-05,
1599
+ "loss": 0.049209845066070554,
1600
+ "mean_token_accuracy": 0.9807434439659118,
1601
+ "num_tokens": 32256895.0,
1602
+ "step": 1570
1603
+ },
1604
+ {
1605
+ "entropy": 0.9190246641635895,
1606
+ "epoch": 2.9097605893186005,
1607
+ "grad_norm": 0.8860642910003662,
1608
+ "learning_rate": 8.497649388234304e-05,
1609
+ "loss": 0.051211881637573245,
1610
+ "mean_token_accuracy": 0.9802342295646668,
1611
+ "num_tokens": 32462031.0,
1612
+ "step": 1580
1613
+ },
1614
+ {
1615
+ "entropy": 0.9088015079498291,
1616
+ "epoch": 2.9281767955801103,
1617
+ "grad_norm": 0.8062726855278015,
1618
+ "learning_rate": 8.475822407466188e-05,
1619
+ "loss": 0.053512704372406,
1620
+ "mean_token_accuracy": 0.979486483335495,
1621
+ "num_tokens": 32667533.0,
1622
+ "step": 1590
1623
+ },
1624
+ {
1625
+ "entropy": 0.9462027847766876,
1626
+ "epoch": 2.9465930018416207,
1627
+ "grad_norm": 0.7962909936904907,
1628
+ "learning_rate": 8.453866485023579e-05,
1629
+ "loss": 0.0501457154750824,
1630
+ "mean_token_accuracy": 0.9803222417831421,
1631
+ "num_tokens": 32872900.0,
1632
+ "step": 1600
1633
+ },
1634
+ {
1635
+ "entropy": 0.9671471297740937,
1636
+ "epoch": 2.9650092081031305,
1637
+ "grad_norm": 0.7641744017601013,
1638
+ "learning_rate": 8.431782435399587e-05,
1639
+ "loss": 0.04629061222076416,
1640
+ "mean_token_accuracy": 0.9823175370693207,
1641
+ "num_tokens": 33077850.0,
1642
+ "step": 1610
1643
+ },
1644
+ {
1645
+ "entropy": 0.955865204334259,
1646
+ "epoch": 2.983425414364641,
1647
+ "grad_norm": 0.6772348880767822,
1648
+ "learning_rate": 8.409571077840426e-05,
1649
+ "loss": 0.048368623852729796,
1650
+ "mean_token_accuracy": 0.9808700799942016,
1651
+ "num_tokens": 33283117.0,
1652
+ "step": 1620
1653
+ },
1654
+ {
1655
+ "epoch": 3.0,
1656
+ "eval_entropy": 0.9563225186389426,
1657
+ "eval_loss": 0.059064481407403946,
1658
+ "eval_mean_token_accuracy": 0.9773589429648026,
1659
+ "eval_num_tokens": 33467712.0,
1660
+ "eval_runtime": 10.1471,
1661
+ "eval_samples_per_second": 360.499,
1662
+ "eval_steps_per_second": 11.333,
1663
+ "step": 1629
1664
+ },
1665
+ {
1666
+ "entropy": 0.9337226033210755,
1667
+ "epoch": 3.001841620626151,
1668
+ "grad_norm": 0.646203875541687,
1669
+ "learning_rate": 8.387233236315016e-05,
1670
+ "loss": 0.043352216482162476,
1671
+ "mean_token_accuracy": 0.9830620110034942,
1672
+ "num_tokens": 33488302.0,
1673
+ "step": 1630
1674
+ },
1675
+ {
1676
+ "entropy": 0.9734923839569092,
1677
+ "epoch": 3.020257826887661,
1678
+ "grad_norm": 0.7564226984977722,
1679
+ "learning_rate": 8.364769739484416e-05,
1680
+ "loss": 0.033932483196258544,
1681
+ "mean_token_accuracy": 0.9872806966304779,
1682
+ "num_tokens": 33693531.0,
1683
+ "step": 1640
1684
+ },
1685
+ {
1686
+ "entropy": 0.9669206500053406,
1687
+ "epoch": 3.0386740331491713,
1688
+ "grad_norm": 0.7126886248588562,
1689
+ "learning_rate": 8.342181420671096e-05,
1690
+ "loss": 0.03818287253379822,
1691
+ "mean_token_accuracy": 0.9852082908153534,
1692
+ "num_tokens": 33899305.0,
1693
+ "step": 1650
1694
+ },
1695
+ {
1696
+ "entropy": 0.9522916138172149,
1697
+ "epoch": 3.0570902394106816,
1698
+ "grad_norm": 1.0571653842926025,
1699
+ "learning_rate": 8.319469117828007e-05,
1700
+ "loss": 0.03456039130687714,
1701
+ "mean_token_accuracy": 0.9867027878761292,
1702
+ "num_tokens": 34104585.0,
1703
+ "step": 1660
1704
+ },
1705
+ {
1706
+ "entropy": 0.9568560004234314,
1707
+ "epoch": 3.0755064456721914,
1708
+ "grad_norm": 0.780940592288971,
1709
+ "learning_rate": 8.296633673507505e-05,
1710
+ "loss": 0.03551802039146423,
1711
+ "mean_token_accuracy": 0.9867531359195709,
1712
+ "num_tokens": 34309516.0,
1713
+ "step": 1670
1714
+ },
1715
+ {
1716
+ "entropy": 0.9590656876564025,
1717
+ "epoch": 3.0939226519337018,
1718
+ "grad_norm": 0.8330219388008118,
1719
+ "learning_rate": 8.273675934830094e-05,
1720
+ "loss": 0.03674865961074829,
1721
+ "mean_token_accuracy": 0.9864118576049805,
1722
+ "num_tokens": 34515170.0,
1723
+ "step": 1680
1724
+ },
1725
+ {
1726
+ "entropy": 0.975881814956665,
1727
+ "epoch": 3.1123388581952116,
1728
+ "grad_norm": 0.7010637521743774,
1729
+ "learning_rate": 8.250596753453e-05,
1730
+ "loss": 0.03550414443016052,
1731
+ "mean_token_accuracy": 0.9864102602005005,
1732
+ "num_tokens": 34720896.0,
1733
+ "step": 1690
1734
+ },
1735
+ {
1736
+ "entropy": 0.9599562883377075,
1737
+ "epoch": 3.130755064456722,
1738
+ "grad_norm": 0.6694278717041016,
1739
+ "learning_rate": 8.227396985538578e-05,
1740
+ "loss": 0.035564273595809937,
1741
+ "mean_token_accuracy": 0.9867321848869324,
1742
+ "num_tokens": 34925970.0,
1743
+ "step": 1700
1744
+ },
1745
+ {
1746
+ "entropy": 0.9582216143608093,
1747
+ "epoch": 3.149171270718232,
1748
+ "grad_norm": 0.9333199262619019,
1749
+ "learning_rate": 8.204077491722546e-05,
1750
+ "loss": 0.035575729608535764,
1751
+ "mean_token_accuracy": 0.9862452208995819,
1752
+ "num_tokens": 35131543.0,
1753
+ "step": 1710
1754
+ },
1755
+ {
1756
+ "entropy": 0.9579678058624268,
1757
+ "epoch": 3.167587476979742,
1758
+ "grad_norm": 0.9450218081474304,
1759
+ "learning_rate": 8.180639137082066e-05,
1760
+ "loss": 0.0385298490524292,
1761
+ "mean_token_accuracy": 0.98538036942482,
1762
+ "num_tokens": 35336790.0,
1763
+ "step": 1720
1764
+ },
1765
+ {
1766
+ "entropy": 0.9640831351280212,
1767
+ "epoch": 3.1860036832412524,
1768
+ "grad_norm": 0.8551534414291382,
1769
+ "learning_rate": 8.157082791103649e-05,
1770
+ "loss": 0.03702138364315033,
1771
+ "mean_token_accuracy": 0.9852015495300293,
1772
+ "num_tokens": 35542294.0,
1773
+ "step": 1730
1774
+ },
1775
+ {
1776
+ "entropy": 0.9867071211338043,
1777
+ "epoch": 3.2044198895027622,
1778
+ "grad_norm": 0.7138128876686096,
1779
+ "learning_rate": 8.133409327650897e-05,
1780
+ "loss": 0.035626694560050964,
1781
+ "mean_token_accuracy": 0.986064875125885,
1782
+ "num_tokens": 35747447.0,
1783
+ "step": 1740
1784
+ },
1785
+ {
1786
+ "entropy": 0.9639089345932007,
1787
+ "epoch": 3.2228360957642725,
1788
+ "grad_norm": 0.7131415009498596,
1789
+ "learning_rate": 8.109619624932092e-05,
1790
+ "loss": 0.035885071754455565,
1791
+ "mean_token_accuracy": 0.986273056268692,
1792
+ "num_tokens": 35952258.0,
1793
+ "step": 1750
1794
+ },
1795
+ {
1796
+ "entropy": 0.9516046345233917,
1797
+ "epoch": 3.241252302025783,
1798
+ "grad_norm": 0.6900200843811035,
1799
+ "learning_rate": 8.085714565467611e-05,
1800
+ "loss": 0.03535219430923462,
1801
+ "mean_token_accuracy": 0.985836285352707,
1802
+ "num_tokens": 36157938.0,
1803
+ "step": 1760
1804
+ },
1805
+ {
1806
+ "entropy": 0.9373646557331086,
1807
+ "epoch": 3.2596685082872927,
1808
+ "grad_norm": 0.6101690530776978,
1809
+ "learning_rate": 8.061695036057191e-05,
1810
+ "loss": 0.034940996766090394,
1811
+ "mean_token_accuracy": 0.9863743901252746,
1812
+ "num_tokens": 36363825.0,
1813
+ "step": 1770
1814
+ },
1815
+ {
1816
+ "entropy": 0.9444344758987426,
1817
+ "epoch": 3.278084714548803,
1818
+ "grad_norm": 0.7518529295921326,
1819
+ "learning_rate": 8.03756192774703e-05,
1820
+ "loss": 0.03404279053211212,
1821
+ "mean_token_accuracy": 0.9866396844387054,
1822
+ "num_tokens": 36568961.0,
1823
+ "step": 1780
1824
+ },
1825
+ {
1826
+ "entropy": 0.9550357758998871,
1827
+ "epoch": 3.2965009208103133,
1828
+ "grad_norm": 0.7687555551528931,
1829
+ "learning_rate": 8.013316135796734e-05,
1830
+ "loss": 0.038447052240371704,
1831
+ "mean_token_accuracy": 0.985325163602829,
1832
+ "num_tokens": 36774514.0,
1833
+ "step": 1790
1834
+ },
1835
+ {
1836
+ "entropy": 0.9477231681346894,
1837
+ "epoch": 3.314917127071823,
1838
+ "grad_norm": 0.7521633505821228,
1839
+ "learning_rate": 7.988958559646102e-05,
1840
+ "loss": 0.03746694028377533,
1841
+ "mean_token_accuracy": 0.9853165090084076,
1842
+ "num_tokens": 36979660.0,
1843
+ "step": 1800
1844
+ },
1845
+ {
1846
+ "entropy": 0.925805002450943,
1847
+ "epoch": 3.3333333333333335,
1848
+ "grad_norm": 0.9333297610282898,
1849
+ "learning_rate": 7.964490102881768e-05,
1850
+ "loss": 0.03700103759765625,
1851
+ "mean_token_accuracy": 0.9850880861282348,
1852
+ "num_tokens": 37185191.0,
1853
+ "step": 1810
1854
+ },
1855
+ {
1856
+ "entropy": 0.9225482225418091,
1857
+ "epoch": 3.3517495395948433,
1858
+ "grad_norm": 0.7928622961044312,
1859
+ "learning_rate": 7.939911673203665e-05,
1860
+ "loss": 0.03825801610946655,
1861
+ "mean_token_accuracy": 0.9850241422653199,
1862
+ "num_tokens": 37390749.0,
1863
+ "step": 1820
1864
+ },
1865
+ {
1866
+ "entropy": 0.9597147881984711,
1867
+ "epoch": 3.3701657458563536,
1868
+ "grad_norm": 0.7658583521842957,
1869
+ "learning_rate": 7.915224182391375e-05,
1870
+ "loss": 0.039855146408081056,
1871
+ "mean_token_accuracy": 0.9845879554748536,
1872
+ "num_tokens": 37596052.0,
1873
+ "step": 1830
1874
+ },
1875
+ {
1876
+ "entropy": 0.9485619068145752,
1877
+ "epoch": 3.388581952117864,
1878
+ "grad_norm": 0.8492130637168884,
1879
+ "learning_rate": 7.890428546270278e-05,
1880
+ "loss": 0.039359599351882935,
1881
+ "mean_token_accuracy": 0.9847265422344208,
1882
+ "num_tokens": 37802063.0,
1883
+ "step": 1840
1884
+ },
1885
+ {
1886
+ "entropy": 0.9670301914215088,
1887
+ "epoch": 3.406998158379374,
1888
+ "grad_norm": 0.7527599930763245,
1889
+ "learning_rate": 7.865525684677608e-05,
1890
+ "loss": 0.03752985596656799,
1891
+ "mean_token_accuracy": 0.9855137526988983,
1892
+ "num_tokens": 38007432.0,
1893
+ "step": 1850
1894
+ },
1895
+ {
1896
+ "entropy": 0.9681244969367981,
1897
+ "epoch": 3.425414364640884,
1898
+ "grad_norm": 0.7599612474441528,
1899
+ "learning_rate": 7.840516521428303e-05,
1900
+ "loss": 0.03653894364833832,
1901
+ "mean_token_accuracy": 0.9858933389186859,
1902
+ "num_tokens": 38212923.0,
1903
+ "step": 1860
1904
+ },
1905
+ {
1906
+ "entropy": 0.9706049561500549,
1907
+ "epoch": 3.443830570902394,
1908
+ "grad_norm": 0.7678127884864807,
1909
+ "learning_rate": 7.815401984280748e-05,
1910
+ "loss": 0.0366938978433609,
1911
+ "mean_token_accuracy": 0.9854713797569274,
1912
+ "num_tokens": 38418422.0,
1913
+ "step": 1870
1914
+ },
1915
+ {
1916
+ "entropy": 0.9637093842029572,
1917
+ "epoch": 3.4622467771639043,
1918
+ "grad_norm": 0.762824535369873,
1919
+ "learning_rate": 7.790183004902359e-05,
1920
+ "loss": 0.03516915142536163,
1921
+ "mean_token_accuracy": 0.9866003453731537,
1922
+ "num_tokens": 38624389.0,
1923
+ "step": 1880
1924
+ },
1925
+ {
1926
+ "entropy": 0.9373565018177032,
1927
+ "epoch": 3.4806629834254146,
1928
+ "grad_norm": 0.8221780061721802,
1929
+ "learning_rate": 7.764860518835014e-05,
1930
+ "loss": 0.04049026966094971,
1931
+ "mean_token_accuracy": 0.984089481830597,
1932
+ "num_tokens": 38829654.0,
1933
+ "step": 1890
1934
+ },
1935
+ {
1936
+ "entropy": 0.9356025457382202,
1937
+ "epoch": 3.4990791896869244,
1938
+ "grad_norm": 0.7583426237106323,
1939
+ "learning_rate": 7.739435465460356e-05,
1940
+ "loss": 0.03658481240272522,
1941
+ "mean_token_accuracy": 0.9857318818569183,
1942
+ "num_tokens": 39034638.0,
1943
+ "step": 1900
1944
+ },
1945
+ {
1946
+ "entropy": 0.9740163326263428,
1947
+ "epoch": 3.5174953959484347,
1948
+ "grad_norm": 0.7332878112792969,
1949
+ "learning_rate": 7.713908787964937e-05,
1950
+ "loss": 0.03508963882923126,
1951
+ "mean_token_accuracy": 0.9863419532775879,
1952
+ "num_tokens": 39240265.0,
1953
+ "step": 1910
1954
+ },
1955
+ {
1956
+ "entropy": 0.9528286933898926,
1957
+ "epoch": 3.5359116022099446,
1958
+ "grad_norm": 0.6515451669692993,
1959
+ "learning_rate": 7.688281433305233e-05,
1960
+ "loss": 0.036055779457092284,
1961
+ "mean_token_accuracy": 0.9860979080200195,
1962
+ "num_tokens": 39445546.0,
1963
+ "step": 1920
1964
+ },
1965
+ {
1966
+ "entropy": 0.9480705261230469,
1967
+ "epoch": 3.554327808471455,
1968
+ "grad_norm": 0.7725827097892761,
1969
+ "learning_rate": 7.662554352172515e-05,
1970
+ "loss": 0.037101513147354125,
1971
+ "mean_token_accuracy": 0.985782790184021,
1972
+ "num_tokens": 39651078.0,
1973
+ "step": 1930
1974
+ },
1975
+ {
1976
+ "entropy": 0.9655321061611175,
1977
+ "epoch": 3.572744014732965,
1978
+ "grad_norm": 0.7756506204605103,
1979
+ "learning_rate": 7.636728498957581e-05,
1980
+ "loss": 0.03721855878829956,
1981
+ "mean_token_accuracy": 0.9857951939105988,
1982
+ "num_tokens": 39856542.0,
1983
+ "step": 1940
1984
+ },
1985
+ {
1986
+ "entropy": 0.9772682309150695,
1987
+ "epoch": 3.591160220994475,
1988
+ "grad_norm": 0.9084987640380859,
1989
+ "learning_rate": 7.610804831715355e-05,
1990
+ "loss": 0.03570749163627625,
1991
+ "mean_token_accuracy": 0.9863450109958649,
1992
+ "num_tokens": 40061913.0,
1993
+ "step": 1950
1994
+ },
1995
+ {
1996
+ "entropy": 0.9579685389995575,
1997
+ "epoch": 3.6095764272559854,
1998
+ "grad_norm": 0.6358487606048584,
1999
+ "learning_rate": 7.584784312129334e-05,
2000
+ "loss": 0.038210684061050416,
2001
+ "mean_token_accuracy": 0.9850837290287018,
2002
+ "num_tokens": 40267398.0,
2003
+ "step": 1960
2004
+ },
2005
+ {
2006
+ "entropy": 0.9605201721191406,
2007
+ "epoch": 3.6279926335174952,
2008
+ "grad_norm": 0.6263149976730347,
2009
+ "learning_rate": 7.558667905475927e-05,
2010
+ "loss": 0.03509160876274109,
2011
+ "mean_token_accuracy": 0.9868143379688263,
2012
+ "num_tokens": 40472827.0,
2013
+ "step": 1970
2014
+ },
2015
+ {
2016
+ "entropy": 0.964026153087616,
2017
+ "epoch": 3.6464088397790055,
2018
+ "grad_norm": 0.90068119764328,
2019
+ "learning_rate": 7.532456580588638e-05,
2020
+ "loss": 0.036211782693862916,
2021
+ "mean_token_accuracy": 0.9858468770980835,
2022
+ "num_tokens": 40677935.0,
2023
+ "step": 1980
2024
+ },
2025
+ {
2026
+ "entropy": 0.9494135618209839,
2027
+ "epoch": 3.664825046040516,
2028
+ "grad_norm": 0.760134756565094,
2029
+ "learning_rate": 7.50615130982213e-05,
2030
+ "loss": 0.03786201477050781,
2031
+ "mean_token_accuracy": 0.9852500438690186,
2032
+ "num_tokens": 40883750.0,
2033
+ "step": 1990
2034
+ },
2035
+ {
2036
+ "entropy": 0.9527071297168732,
2037
+ "epoch": 3.6832412523020257,
2038
+ "grad_norm": 0.9812107682228088,
2039
+ "learning_rate": 7.479753069016152e-05,
2040
+ "loss": 0.03803159594535828,
2041
+ "mean_token_accuracy": 0.9852405369281769,
2042
+ "num_tokens": 41089115.0,
2043
+ "step": 2000
2044
+ },
2045
+ {
2046
+ "entropy": 0.9639330863952636,
2047
+ "epoch": 3.701657458563536,
2048
+ "grad_norm": 0.7164933681488037,
2049
+ "learning_rate": 7.453262837459332e-05,
2050
+ "loss": 0.03912568986415863,
2051
+ "mean_token_accuracy": 0.9849458575248718,
2052
+ "num_tokens": 41294694.0,
2053
+ "step": 2010
2054
+ },
2055
+ {
2056
+ "entropy": 0.9536987483501435,
2057
+ "epoch": 3.720073664825046,
2058
+ "grad_norm": 0.6804596185684204,
2059
+ "learning_rate": 7.426681597852863e-05,
2060
+ "loss": 0.036410006880760196,
2061
+ "mean_token_accuracy": 0.985712206363678,
2062
+ "num_tokens": 41499817.0,
2063
+ "step": 2020
2064
+ },
2065
+ {
2066
+ "entropy": 0.9478164672851562,
2067
+ "epoch": 3.738489871086556,
2068
+ "grad_norm": 0.8799397349357605,
2069
+ "learning_rate": 7.400010336274037e-05,
2070
+ "loss": 0.03801035583019256,
2071
+ "mean_token_accuracy": 0.9850274682044983,
2072
+ "num_tokens": 41704932.0,
2073
+ "step": 2030
2074
+ },
2075
+ {
2076
+ "entropy": 0.9383447647094727,
2077
+ "epoch": 3.7569060773480665,
2078
+ "grad_norm": 0.8386216163635254,
2079
+ "learning_rate": 7.373250042139664e-05,
2080
+ "loss": 0.0373637855052948,
2081
+ "mean_token_accuracy": 0.9854822158813477,
2082
+ "num_tokens": 41910804.0,
2083
+ "step": 2040
2084
+ },
2085
+ {
2086
+ "entropy": 0.925172996520996,
2087
+ "epoch": 3.7753222836095763,
2088
+ "grad_norm": 0.7599324584007263,
2089
+ "learning_rate": 7.346401708169377e-05,
2090
+ "loss": 0.03585260808467865,
2091
+ "mean_token_accuracy": 0.9860672950744629,
2092
+ "num_tokens": 42116706.0,
2093
+ "step": 2050
2094
+ },
2095
+ {
2096
+ "entropy": 0.9463765442371368,
2097
+ "epoch": 3.7937384898710866,
2098
+ "grad_norm": 0.9030149579048157,
2099
+ "learning_rate": 7.319466330348797e-05,
2100
+ "loss": 0.035877206921577455,
2101
+ "mean_token_accuracy": 0.9863968968391419,
2102
+ "num_tokens": 42322670.0,
2103
+ "step": 2060
2104
+ },
2105
+ {
2106
+ "entropy": 0.9942441761493683,
2107
+ "epoch": 3.8121546961325965,
2108
+ "grad_norm": 0.6400449275970459,
2109
+ "learning_rate": 7.292444907892587e-05,
2110
+ "loss": 0.037310433387756345,
2111
+ "mean_token_accuracy": 0.9854151606559753,
2112
+ "num_tokens": 42527752.0,
2113
+ "step": 2070
2114
+ },
2115
+ {
2116
+ "entropy": 0.9577703952789307,
2117
+ "epoch": 3.830570902394107,
2118
+ "grad_norm": 0.6193167567253113,
2119
+ "learning_rate": 7.265338443207387e-05,
2120
+ "loss": 0.03648848831653595,
2121
+ "mean_token_accuracy": 0.9856530070304871,
2122
+ "num_tokens": 42732981.0,
2123
+ "step": 2080
2124
+ },
2125
+ {
2126
+ "entropy": 0.9663952767848969,
2127
+ "epoch": 3.848987108655617,
2128
+ "grad_norm": 0.759611189365387,
2129
+ "learning_rate": 7.238147941854625e-05,
2130
+ "loss": 0.036112996935844424,
2131
+ "mean_token_accuracy": 0.9862765550613404,
2132
+ "num_tokens": 42938619.0,
2133
+ "step": 2090
2134
+ },
2135
+ {
2136
+ "entropy": 0.9484863519668579,
2137
+ "epoch": 3.867403314917127,
2138
+ "grad_norm": 0.7420705556869507,
2139
+ "learning_rate": 7.210874412513218e-05,
2140
+ "loss": 0.03703283965587616,
2141
+ "mean_token_accuracy": 0.9857317566871643,
2142
+ "num_tokens": 43143753.0,
2143
+ "step": 2100
2144
+ },
2145
+ {
2146
+ "entropy": 0.964326673746109,
2147
+ "epoch": 3.8858195211786373,
2148
+ "grad_norm": 0.8779639601707458,
2149
+ "learning_rate": 7.183518866942147e-05,
2150
+ "loss": 0.03739701807498932,
2151
+ "mean_token_accuracy": 0.9852154791355133,
2152
+ "num_tokens": 43349451.0,
2153
+ "step": 2110
2154
+ },
2155
+ {
2156
+ "entropy": 0.9729791641235351,
2157
+ "epoch": 3.904235727440147,
2158
+ "grad_norm": 0.7582741379737854,
2159
+ "learning_rate": 7.156082319942929e-05,
2160
+ "loss": 0.03894525766372681,
2161
+ "mean_token_accuracy": 0.9847454309463501,
2162
+ "num_tokens": 43554598.0,
2163
+ "step": 2120
2164
+ },
2165
+ {
2166
+ "entropy": 0.9860592544078827,
2167
+ "epoch": 3.9226519337016574,
2168
+ "grad_norm": 0.860698938369751,
2169
+ "learning_rate": 7.128565789321969e-05,
2170
+ "loss": 0.0365300178527832,
2171
+ "mean_token_accuracy": 0.9859121859073638,
2172
+ "num_tokens": 43760081.0,
2173
+ "step": 2130
2174
+ },
2175
+ {
2176
+ "entropy": 0.9916551172733307,
2177
+ "epoch": 3.9410681399631677,
2178
+ "grad_norm": 0.8363776206970215,
2179
+ "learning_rate": 7.100970295852805e-05,
2180
+ "loss": 0.036221379041671754,
2181
+ "mean_token_accuracy": 0.9859034180641174,
2182
+ "num_tokens": 43965432.0,
2183
+ "step": 2140
2184
+ },
2185
+ {
2186
+ "entropy": 0.9553558886051178,
2187
+ "epoch": 3.9594843462246776,
2188
+ "grad_norm": 0.9627474546432495,
2189
+ "learning_rate": 7.073296863238242e-05,
2190
+ "loss": 0.03684481382369995,
2191
+ "mean_token_accuracy": 0.9857315957546234,
2192
+ "num_tokens": 44171232.0,
2193
+ "step": 2150
2194
+ },
2195
+ {
2196
+ "entropy": 0.9538035809993743,
2197
+ "epoch": 3.977900552486188,
2198
+ "grad_norm": 0.8399474620819092,
2199
+ "learning_rate": 7.045546518072366e-05,
2200
+ "loss": 0.03825397789478302,
2201
+ "mean_token_accuracy": 0.9846831560134888,
2202
+ "num_tokens": 44376723.0,
2203
+ "step": 2160
2204
+ },
2205
+ {
2206
+ "entropy": 0.9476235210895538,
2207
+ "epoch": 3.9963167587476978,
2208
+ "grad_norm": 0.708739697933197,
2209
+ "learning_rate": 7.017720289802472e-05,
2210
+ "loss": 0.03618018329143524,
2211
+ "mean_token_accuracy": 0.9861325800418854,
2212
+ "num_tokens": 44582407.0,
2213
+ "step": 2170
2214
+ },
2215
+ {
2216
+ "epoch": 4.0,
2217
+ "eval_entropy": 0.9569619194321011,
2218
+ "eval_loss": 0.059838198125362396,
2219
+ "eval_mean_token_accuracy": 0.9777795366618944,
2220
+ "eval_num_tokens": 44623647.0,
2221
+ "eval_runtime": 10.0379,
2222
+ "eval_samples_per_second": 364.42,
2223
+ "eval_steps_per_second": 11.457,
2224
+ "step": 2172
2225
+ },
2226
+ {
2227
+ "entropy": 0.9558675646781921,
2228
+ "epoch": 4.014732965009208,
2229
+ "grad_norm": 0.7347508668899536,
2230
+ "learning_rate": 6.989819210690872e-05,
2231
+ "loss": 0.02886659502983093,
2232
+ "mean_token_accuracy": 0.9892994821071625,
2233
+ "num_tokens": 44788219.0,
2234
+ "step": 2180
2235
+ },
2236
+ {
2237
+ "entropy": 1.0037677466869355,
2238
+ "epoch": 4.033149171270718,
2239
+ "grad_norm": 0.7403206825256348,
2240
+ "learning_rate": 6.961844315776596e-05,
2241
+ "loss": 0.02395295798778534,
2242
+ "mean_token_accuracy": 0.9906026899814606,
2243
+ "num_tokens": 44993505.0,
2244
+ "step": 2190
2245
+ },
2246
+ {
2247
+ "entropy": 1.0068290829658508,
2248
+ "epoch": 4.051565377532229,
2249
+ "grad_norm": 0.7979726195335388,
2250
+ "learning_rate": 6.933796642837003e-05,
2251
+ "loss": 0.02605988085269928,
2252
+ "mean_token_accuracy": 0.9899706900119781,
2253
+ "num_tokens": 45199193.0,
2254
+ "step": 2200
2255
+ },
2256
+ {
2257
+ "entropy": 0.9942211747169495,
2258
+ "epoch": 4.069981583793738,
2259
+ "grad_norm": 0.6460402011871338,
2260
+ "learning_rate": 6.905677232349278e-05,
2261
+ "loss": 0.025350230932235717,
2262
+ "mean_token_accuracy": 0.9899386286735534,
2263
+ "num_tokens": 45404030.0,
2264
+ "step": 2210
2265
+ },
2266
+ {
2267
+ "entropy": 0.9783595442771912,
2268
+ "epoch": 4.088397790055248,
2269
+ "grad_norm": 0.8177055716514587,
2270
+ "learning_rate": 6.877487127451834e-05,
2271
+ "loss": 0.02696993052959442,
2272
+ "mean_token_accuracy": 0.9896106541156768,
2273
+ "num_tokens": 45609763.0,
2274
+ "step": 2220
2275
+ },
2276
+ {
2277
+ "entropy": 0.9801763832569123,
2278
+ "epoch": 4.106813996316759,
2279
+ "grad_norm": 0.6608165502548218,
2280
+ "learning_rate": 6.849227373905618e-05,
2281
+ "loss": 0.025101393461227417,
2282
+ "mean_token_accuracy": 0.9904372334480286,
2283
+ "num_tokens": 45814941.0,
2284
+ "step": 2230
2285
+ },
2286
+ {
2287
+ "entropy": 0.9695689737796783,
2288
+ "epoch": 4.125230202578269,
2289
+ "grad_norm": 0.8036547899246216,
2290
+ "learning_rate": 6.820899020055314e-05,
2291
+ "loss": 0.027827343344688414,
2292
+ "mean_token_accuracy": 0.9890337705612182,
2293
+ "num_tokens": 46020535.0,
2294
+ "step": 2240
2295
+ },
2296
+ {
2297
+ "entropy": 0.9828635334968567,
2298
+ "epoch": 4.143646408839779,
2299
+ "grad_norm": 0.7729921936988831,
2300
+ "learning_rate": 6.792503116790455e-05,
2301
+ "loss": 0.02779492735862732,
2302
+ "mean_token_accuracy": 0.9894372522830963,
2303
+ "num_tokens": 46226013.0,
2304
+ "step": 2250
2305
+ },
2306
+ {
2307
+ "entropy": 0.9978842556476593,
2308
+ "epoch": 4.162062615101289,
2309
+ "grad_norm": 0.7334664463996887,
2310
+ "learning_rate": 6.764040717506432e-05,
2311
+ "loss": 0.025673511624336242,
2312
+ "mean_token_accuracy": 0.9899355113506317,
2313
+ "num_tokens": 46432087.0,
2314
+ "step": 2260
2315
+ },
2316
+ {
2317
+ "entropy": 1.0116403937339782,
2318
+ "epoch": 4.180478821362799,
2319
+ "grad_norm": 0.6769368052482605,
2320
+ "learning_rate": 6.735512878065427e-05,
2321
+ "loss": 0.024705511331558228,
2322
+ "mean_token_accuracy": 0.9906128525733948,
2323
+ "num_tokens": 46637478.0,
2324
+ "step": 2270
2325
+ },
2326
+ {
2327
+ "entropy": 0.9985016226768494,
2328
+ "epoch": 4.198895027624309,
2329
+ "grad_norm": 0.8301573991775513,
2330
+ "learning_rate": 6.706920656757234e-05,
2331
+ "loss": 0.02455987185239792,
2332
+ "mean_token_accuracy": 0.9905728340148926,
2333
+ "num_tokens": 46842562.0,
2334
+ "step": 2280
2335
+ },
2336
+ {
2337
+ "entropy": 0.9909430682659149,
2338
+ "epoch": 4.21731123388582,
2339
+ "grad_norm": 0.656026303768158,
2340
+ "learning_rate": 6.67826511426001e-05,
2341
+ "loss": 0.022711564600467683,
2342
+ "mean_token_accuracy": 0.9910893619060517,
2343
+ "num_tokens": 47048071.0,
2344
+ "step": 2290
2345
+ },
2346
+ {
2347
+ "entropy": 0.9868666052818298,
2348
+ "epoch": 4.23572744014733,
2349
+ "grad_norm": 0.7614991068840027,
2350
+ "learning_rate": 6.649547313600916e-05,
2351
+ "loss": 0.02453812211751938,
2352
+ "mean_token_accuracy": 0.9908901154994965,
2353
+ "num_tokens": 47253507.0,
2354
+ "step": 2300
2355
+ },
2356
+ {
2357
+ "entropy": 0.9870487153530121,
2358
+ "epoch": 4.25414364640884,
2359
+ "grad_norm": 0.7617276906967163,
2360
+ "learning_rate": 6.62076832011669e-05,
2361
+ "loss": 0.025818097591400146,
2362
+ "mean_token_accuracy": 0.990347957611084,
2363
+ "num_tokens": 47458747.0,
2364
+ "step": 2310
2365
+ },
2366
+ {
2367
+ "entropy": 0.9691080570220947,
2368
+ "epoch": 4.27255985267035,
2369
+ "grad_norm": 0.6743029952049255,
2370
+ "learning_rate": 6.591929201414124e-05,
2371
+ "loss": 0.02456912100315094,
2372
+ "mean_token_accuracy": 0.9905289709568024,
2373
+ "num_tokens": 47663643.0,
2374
+ "step": 2320
2375
+ },
2376
+ {
2377
+ "entropy": 0.9701108932495117,
2378
+ "epoch": 4.29097605893186,
2379
+ "grad_norm": 0.6964483261108398,
2380
+ "learning_rate": 6.56303102733046e-05,
2381
+ "loss": 0.02575681209564209,
2382
+ "mean_token_accuracy": 0.9898503363132477,
2383
+ "num_tokens": 47868982.0,
2384
+ "step": 2330
2385
+ },
2386
+ {
2387
+ "entropy": 0.969528192281723,
2388
+ "epoch": 4.30939226519337,
2389
+ "grad_norm": 0.7521987557411194,
2390
+ "learning_rate": 6.5340748698937e-05,
2391
+ "loss": 0.02678089737892151,
2392
+ "mean_token_accuracy": 0.9898572087287902,
2393
+ "num_tokens": 48074314.0,
2394
+ "step": 2340
2395
+ },
2396
+ {
2397
+ "entropy": 0.9921871721744537,
2398
+ "epoch": 4.327808471454881,
2399
+ "grad_norm": 0.6944513320922852,
2400
+ "learning_rate": 6.505061803282844e-05,
2401
+ "loss": 0.025553321838378905,
2402
+ "mean_token_accuracy": 0.9907529592514038,
2403
+ "num_tokens": 48279731.0,
2404
+ "step": 2350
2405
+ },
2406
+ {
2407
+ "entropy": 0.9768964886665344,
2408
+ "epoch": 4.346224677716391,
2409
+ "grad_norm": 0.6553092002868652,
2410
+ "learning_rate": 6.47599290378803e-05,
2411
+ "loss": 0.0250235915184021,
2412
+ "mean_token_accuracy": 0.9904054701328278,
2413
+ "num_tokens": 48485401.0,
2414
+ "step": 2360
2415
+ },
2416
+ {
2417
+ "entropy": 0.9612838506698609,
2418
+ "epoch": 4.3646408839779,
2419
+ "grad_norm": 0.916820228099823,
2420
+ "learning_rate": 6.446869249770619e-05,
2421
+ "loss": 0.028156182169914244,
2422
+ "mean_token_accuracy": 0.9888657331466675,
2423
+ "num_tokens": 48691047.0,
2424
+ "step": 2370
2425
+ },
2426
+ {
2427
+ "entropy": 0.9665832936763763,
2428
+ "epoch": 4.383057090239411,
2429
+ "grad_norm": 0.9197776913642883,
2430
+ "learning_rate": 6.417691921623185e-05,
2431
+ "loss": 0.025303921103477477,
2432
+ "mean_token_accuracy": 0.989986252784729,
2433
+ "num_tokens": 48896234.0,
2434
+ "step": 2380
2435
+ },
2436
+ {
2437
+ "entropy": 0.9686589121818543,
2438
+ "epoch": 4.401473296500921,
2439
+ "grad_norm": 0.8505764603614807,
2440
+ "learning_rate": 6.388462001729434e-05,
2441
+ "loss": 0.024816396832466125,
2442
+ "mean_token_accuracy": 0.9909265041351318,
2443
+ "num_tokens": 49101893.0,
2444
+ "step": 2390
2445
+ },
2446
+ {
2447
+ "entropy": 0.9625210344791413,
2448
+ "epoch": 4.419889502762431,
2449
+ "grad_norm": 1.0601766109466553,
2450
+ "learning_rate": 6.359180574424062e-05,
2451
+ "loss": 0.02706078290939331,
2452
+ "mean_token_accuracy": 0.9895522117614746,
2453
+ "num_tokens": 49307467.0,
2454
+ "step": 2400
2455
+ },
2456
+ {
2457
+ "entropy": 0.9679551541805267,
2458
+ "epoch": 4.4383057090239415,
2459
+ "grad_norm": 0.776253879070282,
2460
+ "learning_rate": 6.329848725952514e-05,
2461
+ "loss": 0.02693203091621399,
2462
+ "mean_token_accuracy": 0.9893981635570526,
2463
+ "num_tokens": 49513020.0,
2464
+ "step": 2410
2465
+ },
2466
+ {
2467
+ "entropy": 0.9704959928989411,
2468
+ "epoch": 4.456721915285451,
2469
+ "grad_norm": 0.5459668636322021,
2470
+ "learning_rate": 6.3004675444307e-05,
2471
+ "loss": 0.0279473751783371,
2472
+ "mean_token_accuracy": 0.9894329369068146,
2473
+ "num_tokens": 49718405.0,
2474
+ "step": 2420
2475
+ },
2476
+ {
2477
+ "entropy": 0.961863350868225,
2478
+ "epoch": 4.475138121546961,
2479
+ "grad_norm": 0.9338833093643188,
2480
+ "learning_rate": 6.27103811980462e-05,
2481
+ "loss": 0.026478803157806395,
2482
+ "mean_token_accuracy": 0.9902269721031189,
2483
+ "num_tokens": 49923375.0,
2484
+ "step": 2430
2485
+ },
2486
+ {
2487
+ "entropy": 0.9708506822586059,
2488
+ "epoch": 4.4935543278084715,
2489
+ "grad_norm": 0.9073707461357117,
2490
+ "learning_rate": 6.241561543809947e-05,
2491
+ "loss": 0.025289520621299744,
2492
+ "mean_token_accuracy": 0.9904769957065582,
2493
+ "num_tokens": 50128901.0,
2494
+ "step": 2440
2495
+ },
2496
+ {
2497
+ "entropy": 0.984996622800827,
2498
+ "epoch": 4.511970534069982,
2499
+ "grad_norm": 0.8674206733703613,
2500
+ "learning_rate": 6.212038909931503e-05,
2501
+ "loss": 0.026442551612854005,
2502
+ "mean_token_accuracy": 0.9905101835727692,
2503
+ "num_tokens": 50334449.0,
2504
+ "step": 2450
2505
+ },
2506
+ {
2507
+ "entropy": 0.9926377475261688,
2508
+ "epoch": 4.530386740331492,
2509
+ "grad_norm": 0.7571811079978943,
2510
+ "learning_rate": 6.182471313362717e-05,
2511
+ "loss": 0.026819539070129395,
2512
+ "mean_token_accuracy": 0.9898989200592041,
2513
+ "num_tokens": 50539597.0,
2514
+ "step": 2460
2515
+ },
2516
+ {
2517
+ "entropy": 0.9450563549995422,
2518
+ "epoch": 4.5488029465930016,
2519
+ "grad_norm": 0.6651087403297424,
2520
+ "learning_rate": 6.15285985096498e-05,
2521
+ "loss": 0.02665227949619293,
2522
+ "mean_token_accuracy": 0.9897156655788422,
2523
+ "num_tokens": 50744926.0,
2524
+ "step": 2470
2525
+ },
2526
+ {
2527
+ "entropy": 0.9715635657310486,
2528
+ "epoch": 4.567219152854512,
2529
+ "grad_norm": 0.7445545196533203,
2530
+ "learning_rate": 6.12320562122697e-05,
2531
+ "loss": 0.026212453842163086,
2532
+ "mean_token_accuracy": 0.9904700636863708,
2533
+ "num_tokens": 50950152.0,
2534
+ "step": 2480
2535
+ },
2536
+ {
2537
+ "entropy": 0.9613442063331604,
2538
+ "epoch": 4.585635359116022,
2539
+ "grad_norm": 0.7168459296226501,
2540
+ "learning_rate": 6.0935097242238837e-05,
2541
+ "loss": 0.02508128583431244,
2542
+ "mean_token_accuracy": 0.9901923894882202,
2543
+ "num_tokens": 51155430.0,
2544
+ "step": 2490
2545
+ },
2546
+ {
2547
+ "entropy": 0.9571944534778595,
2548
+ "epoch": 4.6040515653775325,
2549
+ "grad_norm": 0.7590732574462891,
2550
+ "learning_rate": 6.063773261576646e-05,
2551
+ "loss": 0.025445500016212465,
2552
+ "mean_token_accuracy": 0.9902949810028077,
2553
+ "num_tokens": 51360826.0,
2554
+ "step": 2500
2555
+ },
2556
+ {
2557
+ "entropy": 0.947079461812973,
2558
+ "epoch": 4.622467771639043,
2559
+ "grad_norm": 0.6942175030708313,
2560
+ "learning_rate": 6.033997336411035e-05,
2561
+ "loss": 0.026132801175117494,
2562
+ "mean_token_accuracy": 0.9900939345359803,
2563
+ "num_tokens": 51566095.0,
2564
+ "step": 2510
2565
+ },
2566
+ {
2567
+ "entropy": 0.970003741979599,
2568
+ "epoch": 4.640883977900552,
2569
+ "grad_norm": 0.6562672257423401,
2570
+ "learning_rate": 6.00418305331675e-05,
2571
+ "loss": 0.024759869277477264,
2572
+ "mean_token_accuracy": 0.9905019223690033,
2573
+ "num_tokens": 51771177.0,
2574
+ "step": 2520
2575
+ },
2576
+ {
2577
+ "entropy": 0.9715348601341247,
2578
+ "epoch": 4.6593001841620625,
2579
+ "grad_norm": 0.6151639819145203,
2580
+ "learning_rate": 5.9743315183064564e-05,
2581
+ "loss": 0.024138522148132325,
2582
+ "mean_token_accuracy": 0.9910101473331452,
2583
+ "num_tokens": 51976349.0,
2584
+ "step": 2530
2585
+ },
2586
+ {
2587
+ "entropy": 0.9552160143852234,
2588
+ "epoch": 4.677716390423573,
2589
+ "grad_norm": 0.968815267086029,
2590
+ "learning_rate": 5.9444438387747336e-05,
2591
+ "loss": 0.027274739742279053,
2592
+ "mean_token_accuracy": 0.9896075248718261,
2593
+ "num_tokens": 52181820.0,
2594
+ "step": 2540
2595
+ },
2596
+ {
2597
+ "entropy": 0.9265012145042419,
2598
+ "epoch": 4.696132596685083,
2599
+ "grad_norm": 0.8966720700263977,
2600
+ "learning_rate": 5.914521123457015e-05,
2601
+ "loss": 0.0291823148727417,
2602
+ "mean_token_accuracy": 0.9886700630187988,
2603
+ "num_tokens": 52387511.0,
2604
+ "step": 2550
2605
+ },
2606
+ {
2607
+ "entropy": 0.9156096875667572,
2608
+ "epoch": 4.714548802946593,
2609
+ "grad_norm": 0.7747519612312317,
2610
+ "learning_rate": 5.88456448238844e-05,
2611
+ "loss": 0.02809179127216339,
2612
+ "mean_token_accuracy": 0.9891100466251374,
2613
+ "num_tokens": 52592737.0,
2614
+ "step": 2560
2615
+ },
2616
+ {
2617
+ "entropy": 0.924511456489563,
2618
+ "epoch": 4.732965009208103,
2619
+ "grad_norm": 1.0087049007415771,
2620
+ "learning_rate": 5.8545750268626844e-05,
2621
+ "loss": 0.02683232128620148,
2622
+ "mean_token_accuracy": 0.9896528899669648,
2623
+ "num_tokens": 52798814.0,
2624
+ "step": 2570
2625
+ },
2626
+ {
2627
+ "entropy": 0.9662951111793519,
2628
+ "epoch": 4.751381215469613,
2629
+ "grad_norm": 0.7709590792655945,
2630
+ "learning_rate": 5.824553869390734e-05,
2631
+ "loss": 0.02503817081451416,
2632
+ "mean_token_accuracy": 0.9900161385536194,
2633
+ "num_tokens": 53004478.0,
2634
+ "step": 2580
2635
+ },
2636
+ {
2637
+ "entropy": 0.9889141619205475,
2638
+ "epoch": 4.769797421731123,
2639
+ "grad_norm": 0.815858006477356,
2640
+ "learning_rate": 5.794502123659613e-05,
2641
+ "loss": 0.026327347755432128,
2642
+ "mean_token_accuracy": 0.9900785744190216,
2643
+ "num_tokens": 53209888.0,
2644
+ "step": 2590
2645
+ },
2646
+ {
2647
+ "entropy": 0.9785685896873474,
2648
+ "epoch": 4.788213627992634,
2649
+ "grad_norm": 0.6514431238174438,
2650
+ "learning_rate": 5.7644209044910735e-05,
2651
+ "loss": 0.025033789873123168,
2652
+ "mean_token_accuracy": 0.9902650475502014,
2653
+ "num_tokens": 53415533.0,
2654
+ "step": 2600
2655
+ },
2656
+ {
2657
+ "entropy": 0.9723869919776916,
2658
+ "epoch": 4.806629834254144,
2659
+ "grad_norm": 0.8778963685035706,
2660
+ "learning_rate": 5.7343113278002284e-05,
2661
+ "loss": 0.02379843294620514,
2662
+ "mean_token_accuracy": 0.9909472465515137,
2663
+ "num_tokens": 53620850.0,
2664
+ "step": 2610
2665
+ },
2666
+ {
2667
+ "entropy": 0.9572711050510406,
2668
+ "epoch": 4.8250460405156534,
2669
+ "grad_norm": 0.8927134871482849,
2670
+ "learning_rate": 5.70417451055417e-05,
2671
+ "loss": 0.024856947362422943,
2672
+ "mean_token_accuracy": 0.9904125213623047,
2673
+ "num_tokens": 53826259.0,
2674
+ "step": 2620
2675
+ },
2676
+ {
2677
+ "entropy": 0.9523135125637054,
2678
+ "epoch": 4.843462246777164,
2679
+ "grad_norm": 0.6832691431045532,
2680
+ "learning_rate": 5.674011570730523e-05,
2681
+ "loss": 0.025352203845977785,
2682
+ "mean_token_accuracy": 0.990432596206665,
2683
+ "num_tokens": 54031531.0,
2684
+ "step": 2630
2685
+ },
2686
+ {
2687
+ "entropy": 0.9735220730304718,
2688
+ "epoch": 4.861878453038674,
2689
+ "grad_norm": 0.6399164795875549,
2690
+ "learning_rate": 5.643823627275972e-05,
2691
+ "loss": 0.026541513204574586,
2692
+ "mean_token_accuracy": 0.9900369107723236,
2693
+ "num_tokens": 54237155.0,
2694
+ "step": 2640
2695
+ },
2696
+ {
2697
+ "entropy": 0.9566517114639282,
2698
+ "epoch": 4.880294659300184,
2699
+ "grad_norm": 0.8725414276123047,
2700
+ "learning_rate": 5.6136118000647616e-05,
2701
+ "loss": 0.02675778865814209,
2702
+ "mean_token_accuracy": 0.9894899427890778,
2703
+ "num_tokens": 54442739.0,
2704
+ "step": 2650
2705
+ },
2706
+ {
2707
+ "entropy": 0.9447909593582153,
2708
+ "epoch": 4.898710865561695,
2709
+ "grad_norm": 0.8169302344322205,
2710
+ "learning_rate": 5.583377209857138e-05,
2711
+ "loss": 0.02642086148262024,
2712
+ "mean_token_accuracy": 0.989885401725769,
2713
+ "num_tokens": 54648098.0,
2714
+ "step": 2660
2715
+ },
2716
+ {
2717
+ "entropy": 0.9180052697658538,
2718
+ "epoch": 4.917127071823204,
2719
+ "grad_norm": 0.7768753170967102,
2720
+ "learning_rate": 5.553120978257787e-05,
2721
+ "loss": 0.02552323341369629,
2722
+ "mean_token_accuracy": 0.9899512350559234,
2723
+ "num_tokens": 54854281.0,
2724
+ "step": 2670
2725
+ },
2726
+ {
2727
+ "entropy": 0.917166668176651,
2728
+ "epoch": 4.935543278084714,
2729
+ "grad_norm": 0.8241410851478577,
2730
+ "learning_rate": 5.5228442276742153e-05,
2731
+ "loss": 0.02788199484348297,
2732
+ "mean_token_accuracy": 0.989625746011734,
2733
+ "num_tokens": 55059495.0,
2734
+ "step": 2680
2735
+ },
2736
+ {
2737
+ "entropy": 0.9345465302467346,
2738
+ "epoch": 4.953959484346225,
2739
+ "grad_norm": 0.7645496129989624,
2740
+ "learning_rate": 5.4925480812751166e-05,
2741
+ "loss": 0.02517639398574829,
2742
+ "mean_token_accuracy": 0.9902283847332001,
2743
+ "num_tokens": 55265381.0,
2744
+ "step": 2690
2745
+ },
2746
+ {
2747
+ "entropy": 0.9386432528495788,
2748
+ "epoch": 4.972375690607735,
2749
+ "grad_norm": 0.8371859192848206,
2750
+ "learning_rate": 5.46223366294871e-05,
2751
+ "loss": 0.025585666298866272,
2752
+ "mean_token_accuracy": 0.9903791427612305,
2753
+ "num_tokens": 55471210.0,
2754
+ "step": 2700
2755
+ },
2756
+ {
2757
+ "entropy": 0.9267561137676239,
2758
+ "epoch": 4.990791896869245,
2759
+ "grad_norm": 0.6789297461509705,
2760
+ "learning_rate": 5.43190209726104e-05,
2761
+ "loss": 0.024646708369255067,
2762
+ "mean_token_accuracy": 0.9904700815677643,
2763
+ "num_tokens": 55676877.0,
2764
+ "step": 2710
2765
+ },
2766
+ {
2767
+ "epoch": 5.0,
2768
+ "eval_entropy": 0.9283919717954553,
2769
+ "eval_loss": 0.06225527077913284,
2770
+ "eval_mean_token_accuracy": 0.9784110421719758,
2771
+ "eval_num_tokens": 55779559.0,
2772
+ "eval_runtime": 10.0613,
2773
+ "eval_samples_per_second": 363.573,
2774
+ "eval_steps_per_second": 11.43,
2775
+ "step": 2715
2776
+ }
2777
+ ],
2778
+ "logging_steps": 10,
2779
+ "max_steps": 5430,
2780
+ "num_input_tokens_seen": 0,
2781
+ "num_train_epochs": 10,
2782
+ "save_steps": 500,
2783
+ "stateful_callbacks": {
2784
+ "TrainerControl": {
2785
+ "args": {
2786
+ "should_epoch_stop": false,
2787
+ "should_evaluate": false,
2788
+ "should_log": false,
2789
+ "should_save": true,
2790
+ "should_training_stop": false
2791
+ },
2792
+ "attributes": {}
2793
+ }
2794
+ },
2795
+ "total_flos": 2.6591019550449336e+18,
2796
+ "train_batch_size": 32,
2797
+ "trial_name": null,
2798
+ "trial_params": null
2799
+ }
checkpoint-2715/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75
3
+ size 5777
checkpoint-3258/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-3258/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-3258/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:342d90add5af5dfae9087ea9a560f86a7ebd48116022794da4d371303756af39
3
+ size 80792096
checkpoint-3258/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-3258/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-3258/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-3258/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3258/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75
3
+ size 5777
checkpoint-3801/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-3801/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 8,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "o_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "q_proj",
37
+ "gate_proj",
38
+ "k_proj",
39
+ "up_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-3801/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddf199d0ea570dc0a07cf04650028719a3b079854d9cd9c9c87302cd1e916916
3
+ size 80792096
checkpoint-3801/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-3801/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892