finalform committed on
Commit
0988882
·
verified ·
1 Parent(s): 0057556

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "down_proj",
29
- "o_proj",
30
  "k_proj",
 
 
31
  "v_proj",
32
- "gate_proj",
33
  "up_proj",
34
- "q_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
 
 
28
  "k_proj",
29
+ "o_proj",
30
+ "down_proj",
31
  "v_proj",
32
+ "q_proj",
33
  "up_proj",
34
+ "gate_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e66061757a3f9c7e694bd0869920435ba5cd7a0117a26a35caaceea1882296a8
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48f8910cfd9fa931ded44c23eb7711849b0630badf75472a5a02b4c83906bce3
3
  size 645975704
chat_template.jinja CHANGED
@@ -1,48 +1,84 @@
1
  {%- if tools %}
2
  {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
  {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
  {%- for tool in tools %}
10
  {{- "\n" }}
11
  {{- tool | tojson }}
12
  {%- endfor %}
13
  {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
  {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
  {%- endif %}
20
  {%- endif %}
 
 
 
 
 
 
 
 
21
  {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
 
 
 
 
 
24
  {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
 
 
 
 
 
28
  {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
 
 
 
 
 
32
  {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
  {{- '<|im_start|>user' }}
43
  {%- endif %}
44
  {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
  {{- '\n</tool_response>' }}
47
  {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
  {{- '<|im_end|>\n' }}
@@ -51,4 +87,7 @@
51
  {%- endfor %}
52
  {%- if add_generation_prompt %}
53
  {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
1
  {%- if tools %}
2
  {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
 
 
5
  {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
  {%- for tool in tools %}
8
  {{- "\n" }}
9
  {{- tool | tojson }}
10
  {%- endfor %}
11
  {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
  {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
 
 
15
  {%- endif %}
16
  {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
  {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
  {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
  {%- endif %}
43
+
44
+ {{- '<|im_start|>' + message.role }}
45
+ {% generation %}
46
+ {%- if loop.index0 > ns.last_query_index %}
47
+ {%- if loop.last or (not loop.last and reasoning_content) %}
48
+ {{- '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
49
+ {%- else %}
50
+ {{- content }}
51
  {%- endif %}
52
+ {%- else %}
53
+ {{- content }}
54
+ {%- endif %}
55
+ {%- if message.tool_calls %}
56
+ {%- for tool_call in message.tool_calls %}
57
+ {%- if (loop.first and content) or (not loop.first) %}
58
+ {{- '\n' }}
59
+ {%- endif %}
60
+ {%- if tool_call.function %}
61
+ {%- set tool_call = tool_call.function %}
62
+ {%- endif %}
63
+ {{- '<tool_call>\n{"name": "' }}
64
+ {{- tool_call.name }}
65
+ {{- '", "arguments": ' }}
66
+ {%- if tool_call.arguments is string %}
67
+ {{- tool_call.arguments }}
68
+ {%- else %}
69
+ {{- tool_call.arguments | tojson }}
70
+ {%- endif %}
71
+ {{- '}\n</tool_call>' }}
72
+ {%- endfor %}
73
+ {%- endif %}
74
+ {{- '<|im_end|>' }}
75
+ {% endgeneration %}
76
  {%- elif message.role == "tool" %}
77
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
78
  {{- '<|im_start|>user' }}
79
  {%- endif %}
80
  {{- '\n<tool_response>\n' }}
81
+ {{- content }}
82
  {{- '\n</tool_response>' }}
83
  {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
84
  {{- '<|im_end|>\n' }}
 
87
  {%- endfor %}
88
  {%- if add_generation_prompt %}
89
  {{- '<|im_start|>assistant\n' }}
90
+ {%- if enable_thinking is defined and enable_thinking is false %}
91
+ {{- '<think>\n\n</think>\n\n' }}
92
+ {%- endif %}
93
+ {%- endif %}
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f46dc0b6cd60733e44769de8e8e0858d07c1b25a992eae50a74b529c3e4db236
3
  size 1292087499
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e0503016c1f6ffd793680a057d6ac79ed7603f600c3c72fd3a91b8affb41ccb
3
  size 1292087499
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7192a40ca4014f502efc59dacc942dad83ac6b1e5dbffc44a7e2368a17abffd4
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d7ab697f09475ecdec1ff8902097d1a8197b03c13be0377e062b5c7ea1a4ffa
3
+ size 16389
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22f988a134540f1261cc919cb029bec83fd4a129faeedd432eda41941d88caca
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb8c62fa6411959132eeca62b5f51f65129ff5f237a1b58ae21b4e705cf58cc
3
+ size 16389
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a00527b6663b32e26bbe8bf4772fd7934576df924e6fc0a26b27e451253945f
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f039515b6152a470258ef1dfc9aee16b3afc5843e05bd410cc95b31eea233121
3
+ size 16389
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d158f929065935ff5e42e00f4c943d51cb4a72cc008d4cc7441b13796917f2b6
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29618a16b277174f384730a53826fe4c1ad36502092c89b8fbab81a4e1ce0a3b
3
+ size 16389
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f55c62a921c1c2ecb30cf1d7e4d5a7b0a0ea17498c42bd3415091e054f66478
3
+ size 16389
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:929304abfeed9a99bd0fc122eca08ef1636734606497d74870f319482cf8486a
3
+ size 16389
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9d398039ec5dbf1ee72fd1cdd0a2908f9ce68e1371843f30dfb1415fbe6d1f
3
+ size 16389
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd6b1af69b813d150d919a2d99343aa4b12721bbf48d4ef8242f66cc83bc5d5
3
+ size 16389
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:452277350ad8f5efad158760275ed80bf2a10a74384da69a532b63915629f8f6
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88c1f19fbaac09a7b01b826d2a3eb05434d8b50a36c13d838feee781e2642515
3
  size 1465
trainer_state.json CHANGED
@@ -2,487 +2,186 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 10.0,
6
  "eval_steps": 500,
7
- "global_step": 1040,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
- {
13
- "epoch": 0.24096385542168675,
14
- "grad_norm": 0.17040389776229858,
15
- "learning_rate": 0.000511,
16
- "loss": 0.4303,
17
- "mean_token_accuracy": 0.8983636182546616,
18
- "num_tokens": 569446.0,
19
- "step": 25
20
- },
21
  {
22
  "epoch": 0.4819277108433735,
23
- "grad_norm": 0.19750550389289856,
24
- "learning_rate": 0.000511,
25
- "loss": 0.2953,
26
- "mean_token_accuracy": 0.9258013522624969,
27
- "num_tokens": 1138403.0,
28
- "step": 50
29
- },
30
- {
31
- "epoch": 0.7228915662650602,
32
- "grad_norm": 0.2441304326057434,
33
- "learning_rate": 0.000511,
34
- "loss": 0.2179,
35
- "mean_token_accuracy": 0.9427212655544281,
36
- "num_tokens": 1705606.0,
37
- "step": 75
38
  },
39
  {
40
  "epoch": 0.963855421686747,
41
- "grad_norm": 0.19773538410663605,
42
- "learning_rate": 0.000511,
43
- "loss": 0.194,
44
- "mean_token_accuracy": 0.9479873812198639,
45
- "num_tokens": 2271060.0,
46
- "step": 100
47
  },
48
  {
49
  "epoch": 1.0,
50
- "eval_loss": 0.1995953470468521,
51
- "eval_mean_token_accuracy": 0.9460804703387808,
52
- "eval_num_tokens": 2345411.0,
53
- "eval_runtime": 4.3258,
54
- "eval_samples_per_second": 85.303,
55
- "eval_steps_per_second": 10.865,
56
- "step": 104
57
- },
58
- {
59
- "epoch": 1.202409638554217,
60
- "grad_norm": 0.0789426863193512,
61
- "learning_rate": 0.000511,
62
- "loss": 0.16,
63
- "mean_token_accuracy": 0.9584937607399141,
64
- "num_tokens": 2836512.0,
65
- "step": 125
66
  },
67
  {
68
  "epoch": 1.4433734939759035,
69
- "grad_norm": 0.10643763095140457,
70
- "learning_rate": 0.000511,
71
- "loss": 0.1408,
72
- "mean_token_accuracy": 0.9620217531919479,
73
- "num_tokens": 3403395.0,
74
- "step": 150
75
- },
76
- {
77
- "epoch": 1.6843373493975904,
78
- "grad_norm": 0.08403506129980087,
79
- "learning_rate": 0.000511,
80
- "loss": 0.1314,
81
- "mean_token_accuracy": 0.9630346685647965,
82
- "num_tokens": 3970344.0,
83
- "step": 175
84
  },
85
  {
86
  "epoch": 1.9253012048192772,
87
- "grad_norm": 0.10631190985441208,
88
- "learning_rate": 0.000511,
89
- "loss": 0.1272,
90
- "mean_token_accuracy": 0.9646087974309921,
91
- "num_tokens": 4538034.0,
92
- "step": 200
93
  },
94
  {
95
  "epoch": 2.0,
96
- "eval_loss": 0.14445580542087555,
97
- "eval_mean_token_accuracy": 0.9612192435467497,
98
- "eval_num_tokens": 4690349.0,
99
- "eval_runtime": 4.2683,
100
- "eval_samples_per_second": 86.451,
101
- "eval_steps_per_second": 11.011,
102
- "step": 208
103
- },
104
- {
105
- "epoch": 2.163855421686747,
106
- "grad_norm": 0.08164115995168686,
107
- "learning_rate": 0.000511,
108
- "loss": 0.1067,
109
- "mean_token_accuracy": 0.9689010110768405,
110
- "num_tokens": 5091219.0,
111
- "step": 225
112
  },
113
  {
114
  "epoch": 2.404819277108434,
115
- "grad_norm": 0.08637778460979462,
116
- "learning_rate": 0.000511,
117
- "loss": 0.0968,
118
- "mean_token_accuracy": 0.9721185141801834,
119
- "num_tokens": 5658328.0,
120
- "step": 250
121
- },
122
- {
123
- "epoch": 2.6457831325301204,
124
- "grad_norm": 0.0902683362364769,
125
- "learning_rate": 0.000511,
126
- "loss": 0.091,
127
- "mean_token_accuracy": 0.9744718617200852,
128
- "num_tokens": 6225744.0,
129
- "step": 275
130
  },
131
  {
132
  "epoch": 2.886746987951807,
133
- "grad_norm": 0.09357521682977676,
134
- "learning_rate": 0.000511,
135
- "loss": 0.0929,
136
- "mean_token_accuracy": 0.9727102434635162,
137
- "num_tokens": 6794402.0,
138
- "step": 300
139
  },
140
  {
141
  "epoch": 3.0,
142
- "eval_loss": 0.14705069363117218,
143
- "eval_mean_token_accuracy": 0.9624257531571896,
144
- "eval_num_tokens": 7035273.0,
145
- "eval_runtime": 4.2634,
146
- "eval_samples_per_second": 86.55,
147
- "eval_steps_per_second": 11.024,
148
- "step": 312
149
- },
150
- {
151
- "epoch": 3.125301204819277,
152
- "grad_norm": 0.14760874211788177,
153
- "learning_rate": 0.000511,
154
- "loss": 0.0842,
155
- "mean_token_accuracy": 0.9764150320881545,
156
- "num_tokens": 7334495.0,
157
- "step": 325
158
  },
159
  {
160
  "epoch": 3.3662650602409636,
161
- "grad_norm": 0.09895172715187073,
162
- "learning_rate": 0.000511,
163
- "loss": 0.0777,
164
- "mean_token_accuracy": 0.9774631917476654,
165
- "num_tokens": 7903478.0,
166
- "step": 350
167
- },
168
- {
169
- "epoch": 3.6072289156626507,
170
- "grad_norm": 0.10538128763437271,
171
- "learning_rate": 0.000511,
172
- "loss": 0.0742,
173
- "mean_token_accuracy": 0.9783486902713776,
174
- "num_tokens": 8469479.0,
175
- "step": 375
176
  },
177
  {
178
  "epoch": 3.8481927710843373,
179
- "grad_norm": 0.09741026163101196,
180
- "learning_rate": 0.000511,
181
- "loss": 0.0679,
182
- "mean_token_accuracy": 0.9803410685062408,
183
- "num_tokens": 9036138.0,
184
- "step": 400
185
  },
186
  {
187
  "epoch": 4.0,
188
- "eval_loss": 0.14550796151161194,
189
- "eval_mean_token_accuracy": 0.9638357936067784,
190
- "eval_num_tokens": 9380804.0,
191
- "eval_runtime": 4.2387,
192
- "eval_samples_per_second": 87.055,
193
- "eval_steps_per_second": 11.088,
194
- "step": 416
195
- },
196
- {
197
- "epoch": 4.086746987951807,
198
- "grad_norm": 0.10616449266672134,
199
- "learning_rate": 0.000511,
200
- "loss": 0.0625,
201
- "mean_token_accuracy": 0.9811063475079007,
202
- "num_tokens": 9598059.0,
203
- "step": 425
204
  },
205
  {
206
  "epoch": 4.327710843373494,
207
- "grad_norm": 0.09445353597402573,
208
- "learning_rate": 0.000511,
209
- "loss": 0.0545,
210
- "mean_token_accuracy": 0.9833864039182663,
211
- "num_tokens": 10165089.0,
212
- "step": 450
213
- },
214
- {
215
- "epoch": 4.5686746987951805,
216
- "grad_norm": 0.07407805323600769,
217
- "learning_rate": 0.000511,
218
- "loss": 0.0545,
219
- "mean_token_accuracy": 0.984256454706192,
220
- "num_tokens": 10732931.0,
221
- "step": 475
222
  },
223
  {
224
  "epoch": 4.809638554216868,
225
- "grad_norm": 0.07322381436824799,
226
- "learning_rate": 0.000511,
227
- "loss": 0.0462,
228
- "mean_token_accuracy": 0.9861221539974213,
229
- "num_tokens": 11301466.0,
230
- "step": 500
231
  },
232
  {
233
  "epoch": 5.0,
234
- "eval_loss": 0.14242176711559296,
235
- "eval_mean_token_accuracy": 0.9691625554510888,
236
- "eval_num_tokens": 11726407.0,
237
- "eval_runtime": 4.2397,
238
- "eval_samples_per_second": 87.035,
239
- "eval_steps_per_second": 11.086,
240
- "step": 520
241
- },
242
- {
243
- "epoch": 5.048192771084337,
244
- "grad_norm": 0.06890378147363663,
245
- "learning_rate": 0.000511,
246
- "loss": 0.0538,
247
- "mean_token_accuracy": 0.9846178467827614,
248
- "num_tokens": 11856936.0,
249
- "step": 525
250
  },
251
  {
252
  "epoch": 5.289156626506024,
253
- "grad_norm": 0.05453705042600632,
254
- "learning_rate": 0.000511,
255
- "loss": 0.0485,
256
- "mean_token_accuracy": 0.9858993107080459,
257
- "num_tokens": 12423275.0,
258
- "step": 550
259
- },
260
- {
261
- "epoch": 5.530120481927711,
262
- "grad_norm": 0.0743594691157341,
263
- "learning_rate": 0.000511,
264
- "loss": 0.0455,
265
- "mean_token_accuracy": 0.9857969325780869,
266
- "num_tokens": 12992045.0,
267
- "step": 575
268
  },
269
  {
270
  "epoch": 5.771084337349397,
271
- "grad_norm": 0.06587184965610504,
272
- "learning_rate": 0.000511,
273
- "loss": 0.0446,
274
- "mean_token_accuracy": 0.9862634456157684,
275
- "num_tokens": 13560037.0,
276
- "step": 600
277
  },
278
  {
279
  "epoch": 6.0,
280
- "eval_loss": 0.1241711750626564,
281
- "eval_mean_token_accuracy": 0.9710005204728309,
282
- "eval_num_tokens": 14071932.0,
283
- "eval_runtime": 4.2228,
284
- "eval_samples_per_second": 87.383,
285
- "eval_steps_per_second": 11.13,
286
- "step": 624
287
- },
288
- {
289
- "epoch": 6.009638554216868,
290
- "grad_norm": 0.06124307960271835,
291
- "learning_rate": 0.000511,
292
- "loss": 0.0368,
293
- "mean_token_accuracy": 0.9886124525407348,
294
- "num_tokens": 14102915.0,
295
- "step": 625
296
- },
297
- {
298
- "epoch": 6.250602409638554,
299
- "grad_norm": 0.10462699830532074,
300
- "learning_rate": 0.000511,
301
- "loss": 0.0378,
302
- "mean_token_accuracy": 0.9887062352895737,
303
- "num_tokens": 14669309.0,
304
- "step": 650
305
- },
306
- {
307
- "epoch": 6.491566265060241,
308
- "grad_norm": 0.09343062341213226,
309
- "learning_rate": 0.000511,
310
- "loss": 0.0353,
311
- "mean_token_accuracy": 0.9892213380336762,
312
- "num_tokens": 15237353.0,
313
- "step": 675
314
- },
315
- {
316
- "epoch": 6.732530120481927,
317
- "grad_norm": 0.08443740010261536,
318
- "learning_rate": 0.000511,
319
- "loss": 0.0364,
320
- "mean_token_accuracy": 0.9892494148015976,
321
- "num_tokens": 15804441.0,
322
- "step": 700
323
- },
324
- {
325
- "epoch": 6.973493975903614,
326
- "grad_norm": 0.07635796070098877,
327
- "learning_rate": 0.000511,
328
- "loss": 0.0397,
329
- "mean_token_accuracy": 0.9886371964216232,
330
- "num_tokens": 16368577.0,
331
- "step": 725
332
- },
333
- {
334
- "epoch": 7.0,
335
- "eval_loss": 0.17054519057273865,
336
- "eval_mean_token_accuracy": 0.9649876089806252,
337
- "eval_num_tokens": 16416958.0,
338
- "eval_runtime": 4.2384,
339
- "eval_samples_per_second": 87.061,
340
- "eval_steps_per_second": 11.089,
341
- "step": 728
342
- },
343
- {
344
- "epoch": 7.212048192771085,
345
- "grad_norm": 0.0668734461069107,
346
- "learning_rate": 0.000511,
347
- "loss": 0.0333,
348
- "mean_token_accuracy": 0.9895464802029157,
349
- "num_tokens": 16927490.0,
350
- "step": 750
351
- },
352
- {
353
- "epoch": 7.453012048192771,
354
- "grad_norm": 0.06639474630355835,
355
- "learning_rate": 0.000511,
356
- "loss": 0.0325,
357
- "mean_token_accuracy": 0.9908391135931015,
358
- "num_tokens": 17494143.0,
359
- "step": 775
360
- },
361
- {
362
- "epoch": 7.693975903614458,
363
- "grad_norm": 0.10801058262586594,
364
- "learning_rate": 0.000511,
365
- "loss": 0.029,
366
- "mean_token_accuracy": 0.9909009468555451,
367
- "num_tokens": 18063423.0,
368
- "step": 800
369
- },
370
- {
371
- "epoch": 7.934939759036144,
372
- "grad_norm": 0.048982683569192886,
373
- "learning_rate": 0.000511,
374
- "loss": 0.0313,
375
- "mean_token_accuracy": 0.9908118671178818,
376
- "num_tokens": 18630905.0,
377
- "step": 825
378
- },
379
- {
380
- "epoch": 8.0,
381
- "eval_loss": 0.1113305315375328,
382
- "eval_mean_token_accuracy": 0.9737493814306056,
383
- "eval_num_tokens": 18762023.0,
384
- "eval_runtime": 4.2275,
385
- "eval_samples_per_second": 87.285,
386
- "eval_steps_per_second": 11.118,
387
- "step": 832
388
- },
389
- {
390
- "epoch": 8.173493975903614,
391
- "grad_norm": 0.05220003426074982,
392
- "learning_rate": 0.000511,
393
- "loss": 0.0277,
394
- "mean_token_accuracy": 0.9916808304160533,
395
- "num_tokens": 19184427.0,
396
- "step": 850
397
- },
398
- {
399
- "epoch": 8.4144578313253,
400
- "grad_norm": 0.08605129271745682,
401
- "learning_rate": 0.000511,
402
- "loss": 0.0273,
403
- "mean_token_accuracy": 0.9915985196828843,
404
- "num_tokens": 19754417.0,
405
- "step": 875
406
- },
407
- {
408
- "epoch": 8.655421686746989,
409
- "grad_norm": 0.0517394132912159,
410
- "learning_rate": 0.000511,
411
- "loss": 0.0258,
412
- "mean_token_accuracy": 0.9922689855098724,
413
- "num_tokens": 20322925.0,
414
- "step": 900
415
- },
416
- {
417
- "epoch": 8.896385542168675,
418
- "grad_norm": 0.059128183871507645,
419
- "learning_rate": 0.000511,
420
- "loss": 0.0256,
421
- "mean_token_accuracy": 0.9923817366361618,
422
- "num_tokens": 20889889.0,
423
- "step": 925
424
- },
425
- {
426
- "epoch": 9.0,
427
- "eval_loss": 0.1412837952375412,
428
- "eval_mean_token_accuracy": 0.97177672893443,
429
- "eval_num_tokens": 21107006.0,
430
- "eval_runtime": 4.2426,
431
- "eval_samples_per_second": 86.974,
432
- "eval_steps_per_second": 11.078,
433
- "step": 936
434
- },
435
- {
436
- "epoch": 9.134939759036145,
437
- "grad_norm": 0.08735097944736481,
438
- "learning_rate": 0.000511,
439
- "loss": 0.0214,
440
- "mean_token_accuracy": 0.9934688914905895,
441
- "num_tokens": 21433768.0,
442
- "step": 950
443
- },
444
- {
445
- "epoch": 9.375903614457831,
446
- "grad_norm": 0.07706229388713837,
447
- "learning_rate": 0.000511,
448
- "loss": 0.0219,
449
- "mean_token_accuracy": 0.9933523815870285,
450
- "num_tokens": 22001855.0,
451
- "step": 975
452
- },
453
- {
454
- "epoch": 9.616867469879518,
455
- "grad_norm": 0.0835743248462677,
456
- "learning_rate": 0.000511,
457
- "loss": 0.0202,
458
- "mean_token_accuracy": 0.9938382267951965,
459
- "num_tokens": 22568226.0,
460
- "step": 1000
461
- },
462
- {
463
- "epoch": 9.857831325301206,
464
- "grad_norm": 0.10814040899276733,
465
- "learning_rate": 0.000511,
466
- "loss": 0.0222,
467
- "mean_token_accuracy": 0.99358702480793,
468
- "num_tokens": 23136558.0,
469
- "step": 1025
470
- },
471
- {
472
- "epoch": 10.0,
473
- "eval_loss": 0.1364360749721527,
474
- "eval_mean_token_accuracy": 0.9739730041077796,
475
- "eval_num_tokens": 23451914.0,
476
- "eval_runtime": 4.2514,
477
- "eval_samples_per_second": 86.796,
478
- "eval_steps_per_second": 11.055,
479
- "step": 1040
480
  }
481
  ],
482
  "logging_steps": 25,
483
- "max_steps": 1248,
484
  "num_input_tokens_seen": 0,
485
- "num_train_epochs": 12,
486
  "save_steps": 500,
487
  "stateful_callbacks": {
488
  "TrainerControl": {
@@ -496,8 +195,8 @@
496
  "attributes": {}
497
  }
498
  },
499
- "total_flos": 1.0255354758734807e+18,
500
- "train_batch_size": 2,
501
  "trial_name": null,
502
  "trial_params": null
503
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 6.0,
6
  "eval_steps": 500,
7
+ "global_step": 312,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
 
 
 
 
 
 
 
 
 
12
  {
13
  "epoch": 0.4819277108433735,
14
+ "grad_norm": 0.04972561076283455,
15
+ "learning_rate": 0.000509291899366086,
16
+ "loss": 0.2396,
17
+ "mean_token_accuracy": 0.9078072667121887,
18
+ "num_tokens": 10242232.0,
19
+ "step": 25
 
 
 
 
 
 
 
 
 
20
  },
21
  {
22
  "epoch": 0.963855421686747,
23
+ "grad_norm": 0.04746510088443756,
24
+ "learning_rate": 0.0004965277770447238,
25
+ "loss": 0.1033,
26
+ "mean_token_accuracy": 0.9155593666434289,
27
+ "num_tokens": 20493886.0,
28
+ "step": 50
29
  },
30
  {
31
  "epoch": 1.0,
32
+ "eval_loss": NaN,
33
+ "eval_mean_token_accuracy": 0.9735881001391309,
34
+ "eval_num_tokens": 21054178.0,
35
+ "eval_runtime": 41.8061,
36
+ "eval_samples_per_second": 8.826,
37
+ "eval_steps_per_second": 1.124,
38
+ "step": 52
 
 
 
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 1.4433734939759035,
42
+ "grad_norm": 0.048190850764513016,
43
+ "learning_rate": 0.00047188122815287187,
44
+ "loss": 0.0796,
45
+ "mean_token_accuracy": 0.9210376496890083,
46
+ "num_tokens": 30877786.0,
47
+ "step": 75
 
 
 
 
 
 
 
 
 
48
  },
49
  {
50
  "epoch": 1.9253012048192772,
51
+ "grad_norm": 0.07209828495979309,
52
+ "learning_rate": 0.0004365673027192623,
53
+ "loss": 0.063,
54
+ "mean_token_accuracy": 0.9394274836778641,
55
+ "num_tokens": 41107523.0,
56
+ "step": 100
57
  },
58
  {
59
  "epoch": 2.0,
60
+ "eval_loss": NaN,
61
+ "eval_mean_token_accuracy": 0.9802652305745064,
62
+ "eval_num_tokens": 42118832.0,
63
+ "eval_runtime": 43.7261,
64
+ "eval_samples_per_second": 8.439,
65
+ "eval_steps_per_second": 1.075,
66
+ "step": 104
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
  "epoch": 2.404819277108434,
70
+ "grad_norm": 0.05603017657995224,
71
+ "learning_rate": 0.00039232694168865086,
72
+ "loss": 0.0545,
73
+ "mean_token_accuracy": 0.9359195819452181,
74
+ "num_tokens": 51238167.0,
75
+ "step": 125
 
 
 
 
 
 
 
 
 
76
  },
77
  {
78
  "epoch": 2.886746987951807,
79
+ "grad_norm": 0.07570821046829224,
80
+ "learning_rate": 0.00034134115028725524,
81
+ "loss": 0.0391,
82
+ "mean_token_accuracy": 0.9296775516867638,
83
+ "num_tokens": 61475716.0,
84
+ "step": 150
85
  },
86
  {
87
  "epoch": 3.0,
88
+ "eval_loss": NaN,
89
+ "eval_mean_token_accuracy": 0.9825496534083752,
90
+ "eval_num_tokens": 63186871.0,
91
+ "eval_runtime": 42.5381,
92
+ "eval_samples_per_second": 8.675,
93
+ "eval_steps_per_second": 1.105,
94
+ "step": 156
 
 
 
 
 
 
 
 
 
95
  },
96
  {
97
  "epoch": 3.3662650602409636,
98
+ "grad_norm": 0.07343257963657379,
99
+ "learning_rate": 0.0002861234766624867,
100
+ "loss": 0.0345,
101
+ "mean_token_accuracy": 0.9456785363168573,
102
+ "num_tokens": 71027426.0,
103
+ "step": 175
 
 
 
 
 
 
 
 
 
104
  },
105
  {
106
  "epoch": 3.8481927710843373,
107
+ "grad_norm": 0.04622579738497734,
108
+ "learning_rate": 0.0002293960964917063,
109
+ "loss": 0.0259,
110
+ "mean_token_accuracy": 0.9330729904770851,
111
+ "num_tokens": 81341930.0,
112
+ "step": 200
113
  },
114
  {
115
  "epoch": 4.0,
116
+ "eval_loss": NaN,
117
+ "eval_mean_token_accuracy": 0.9852228583173549,
118
+ "eval_num_tokens": 84250831.0,
119
+ "eval_runtime": 41.6702,
120
+ "eval_samples_per_second": 8.855,
121
+ "eval_steps_per_second": 1.128,
122
+ "step": 208
 
 
 
 
 
 
 
 
 
123
  },
124
  {
125
  "epoch": 4.327710843373494,
126
+ "grad_norm": 0.04485568404197693,
127
+ "learning_rate": 0.0001739556124639496,
128
+ "loss": 0.0208,
129
+ "mean_token_accuracy": 0.9436070158253962,
130
+ "num_tokens": 91589095.0,
131
+ "step": 225
 
 
 
 
 
 
 
 
 
132
  },
133
  {
134
  "epoch": 4.809638554216868,
135
+ "grad_norm": 0.03364564850926399,
136
+ "learning_rate": 0.00012253518458496144,
137
+ "loss": 0.0185,
138
+ "mean_token_accuracy": 0.9403582978248596,
139
+ "num_tokens": 101826338.0,
140
+ "step": 250
141
  },
142
  {
143
  "epoch": 5.0,
144
+ "eval_loss": NaN,
145
+ "eval_mean_token_accuracy": 0.9858830913584283,
146
+ "eval_num_tokens": 105323752.0,
147
+ "eval_runtime": 41.5429,
148
+ "eval_samples_per_second": 8.882,
149
+ "eval_steps_per_second": 1.131,
150
+ "step": 260
 
 
 
 
 
 
 
 
 
151
  },
152
  {
153
  "epoch": 5.289156626506024,
154
+ "grad_norm": 0.04389314353466034,
155
+ "learning_rate": 7.766978814259806e-05,
156
+ "loss": 0.0122,
157
+ "mean_token_accuracy": 0.9414373160007611,
158
+ "num_tokens": 111938446.0,
159
+ "step": 275
 
 
 
 
 
 
 
 
 
160
  },
161
  {
162
  "epoch": 5.771084337349397,
163
+ "grad_norm": 0.03338490426540375,
164
+ "learning_rate": 4.1571241979147114e-05,
165
+ "loss": 0.0094,
166
+ "mean_token_accuracy": 0.9374336645007133,
167
+ "num_tokens": 122190732.0,
168
+ "step": 300
169
  },
170
  {
171
  "epoch": 6.0,
172
+ "eval_loss": NaN,
173
+ "eval_mean_token_accuracy": 0.983714316753631,
174
+ "eval_num_tokens": 126386752.0,
175
+ "eval_runtime": 42.0902,
176
+ "eval_samples_per_second": 8.767,
177
+ "eval_steps_per_second": 1.117,
178
+ "step": 312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  }
180
  ],
181
  "logging_steps": 25,
182
+ "max_steps": 364,
183
  "num_input_tokens_seen": 0,
184
+ "num_train_epochs": 7,
185
  "save_steps": 500,
186
  "stateful_callbacks": {
187
  "TrainerControl": {
 
195
  "attributes": {}
196
  }
197
  },
198
+ "total_flos": 5.549416727210623e+18,
199
+ "train_batch_size": 1,
200
  "trial_name": null,
201
  "trial_params": null
202
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9abd2e560826c250d1fd3df1ec10e14fef0b5ef3b175f43df982f2fc9f2f11d9
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ffd5f1c6a0b427299412ba1ef18436e0c585b3ee0c4bf80b6deb44a24dfc18
3
  size 6097