finalform commited on
Commit
6d1d04a
·
verified ·
1 Parent(s): 4b34842

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. adapter_config.json +2 -2
  3. adapter_model.safetensors +1 -1
  4. checkpoint-1245/adapter_config.json +2 -2
  5. checkpoint-1245/adapter_model.safetensors +1 -1
  6. checkpoint-1245/optimizer.pt +1 -1
  7. checkpoint-1245/rng_state.pth +1 -1
  8. checkpoint-1245/scheduler.pt +1 -1
  9. checkpoint-1245/trainer_state.json +279 -249
  10. checkpoint-1245/training_args.bin +1 -1
  11. checkpoint-1660/README.md +202 -0
  12. checkpoint-1660/adapter_config.json +39 -0
  13. checkpoint-1660/adapter_model.safetensors +3 -0
  14. checkpoint-1660/added_tokens.json +28 -0
  15. checkpoint-1660/chat_template.jinja +89 -0
  16. checkpoint-1660/merges.txt +0 -0
  17. checkpoint-1660/optimizer.pt +3 -0
  18. checkpoint-1660/rng_state.pth +3 -0
  19. checkpoint-1660/scheduler.pt +3 -0
  20. checkpoint-1660/special_tokens_map.json +25 -0
  21. checkpoint-1660/tokenizer.json +3 -0
  22. checkpoint-1660/tokenizer_config.json +239 -0
  23. checkpoint-1660/trainer_state.json +668 -0
  24. checkpoint-1660/training_args.bin +3 -0
  25. checkpoint-1660/vocab.json +0 -0
  26. checkpoint-415/README.md +202 -0
  27. checkpoint-415/adapter_config.json +39 -0
  28. checkpoint-415/adapter_model.safetensors +3 -0
  29. checkpoint-415/added_tokens.json +28 -0
  30. checkpoint-415/chat_template.jinja +89 -0
  31. checkpoint-415/merges.txt +0 -0
  32. checkpoint-415/optimizer.pt +3 -0
  33. checkpoint-415/rng_state.pth +3 -0
  34. checkpoint-415/scheduler.pt +3 -0
  35. checkpoint-415/special_tokens_map.json +25 -0
  36. checkpoint-415/tokenizer.json +3 -0
  37. checkpoint-415/tokenizer_config.json +239 -0
  38. checkpoint-415/trainer_state.json +188 -0
  39. checkpoint-415/training_args.bin +3 -0
  40. checkpoint-415/vocab.json +0 -0
  41. checkpoint-830/README.md +202 -0
  42. checkpoint-830/adapter_config.json +39 -0
  43. checkpoint-830/adapter_model.safetensors +3 -0
  44. checkpoint-830/added_tokens.json +28 -0
  45. checkpoint-830/chat_template.jinja +89 -0
  46. checkpoint-830/merges.txt +0 -0
  47. checkpoint-830/optimizer.pt +3 -0
  48. checkpoint-830/rng_state.pth +3 -0
  49. checkpoint-830/scheduler.pt +3 -0
  50. checkpoint-830/special_tokens_map.json +25 -0
.gitattributes CHANGED
@@ -36,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  checkpoint-1245/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
36
  checkpoint-1245/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-1660/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-415/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-830/tokenizer.json filter=lfs diff=lfs merge=lfs -text
adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "k_proj",
 
28
  "gate_proj",
29
  "o_proj",
30
  "down_proj",
31
- "v_proj",
32
- "q_proj",
33
  "up_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
+ "q_proj",
28
  "k_proj",
29
+ "v_proj",
30
  "gate_proj",
31
  "o_proj",
32
  "down_proj",
 
 
33
  "up_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03224c5e80ca77378b2d951c7e354983e452793b01311aca070149ab1b3752f6
3
  size 349243752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea2ad39cdd86eff3c4df726c9ae56cab758f941a5339c78780151d5f982614d6
3
  size 349243752
checkpoint-1245/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "k_proj",
 
28
  "gate_proj",
29
  "o_proj",
30
  "down_proj",
31
- "v_proj",
32
- "q_proj",
33
  "up_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
+ "q_proj",
28
  "k_proj",
29
+ "v_proj",
30
  "gate_proj",
31
  "o_proj",
32
  "down_proj",
 
 
33
  "up_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
checkpoint-1245/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03224c5e80ca77378b2d951c7e354983e452793b01311aca070149ab1b3752f6
3
  size 349243752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:debbf9c7f4978f933f45469be929af12fd6873eb796fb1cfede2ddf133a31247
3
  size 349243752
checkpoint-1245/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c06811ab2521eb093525da13aff2b31069d46b9db08099b1a092ff3122a2495e
3
  size 698662547
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4bc260a1513be6d302f69fb80d16cd2d6a1bb041d6cd844862be1de3616901c
3
  size 698662547
checkpoint-1245/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca940159064321c03f2cd2c848b8308bc995f510752a53bc4b8a8f27aa7b6e76
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e12287504ff9b057ed520c310df9c42c835bf6f37da9431e263304d2dd53349e
3
  size 14645
checkpoint-1245/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82fcd00123c3c69d4e7b09d8e96247aa0926ecb0862a7624726a4095234b5d76
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:193574996f600c28e8b47f45151d73bb7e621e2e2ca63a97be22e8a15e926943
3
  size 1465
checkpoint-1245/trainer_state.json CHANGED
@@ -11,451 +11,481 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
- "grad_norm": 0.2726256847381592,
15
- "learning_rate": 0.0001894736842105263,
16
- "loss": 1.7821,
17
- "mean_token_accuracy": 0.634402088522911,
18
- "num_tokens": 157892.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
- "grad_norm": 0.27001282572746277,
24
- "learning_rate": 0.00029993852448555923,
25
- "loss": 0.8134,
26
- "mean_token_accuracy": 0.7912577825784683,
27
- "num_tokens": 284188.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
- "grad_norm": 0.23073740303516388,
33
- "learning_rate": 0.00029934198818572623,
34
- "loss": 0.603,
35
- "mean_token_accuracy": 0.8322482287883759,
36
- "num_tokens": 446681.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
- "grad_norm": 0.28391265869140625,
42
- "learning_rate": 0.0002981133400718627,
43
- "loss": 0.4865,
44
- "mean_token_accuracy": 0.8649927872419357,
45
- "num_tokens": 575282.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
- "grad_norm": 0.24657945334911346,
51
- "learning_rate": 0.0002962577805768642,
52
- "loss": 0.3658,
53
- "mean_token_accuracy": 0.8938217234611511,
54
- "num_tokens": 735901.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
- "grad_norm": 0.33079952001571655,
60
- "learning_rate": 0.00029378316362776546,
61
- "loss": 0.3273,
62
- "mean_token_accuracy": 0.9049834907054901,
63
- "num_tokens": 864618.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
- "grad_norm": 0.2289544939994812,
69
- "learning_rate": 0.0002906999634028451,
70
- "loss": 0.2503,
71
- "mean_token_accuracy": 0.9277959632873535,
72
- "num_tokens": 1024868.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
- "grad_norm": 0.44999751448631287,
78
- "learning_rate": 0.0002870212299981334,
79
- "loss": 0.2345,
80
- "mean_token_accuracy": 0.9310170763731003,
81
- "num_tokens": 1152961.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
- "grad_norm": 0.2300587147474289,
87
- "learning_rate": 0.00028276253419097193,
88
- "loss": 0.1834,
89
- "mean_token_accuracy": 0.9455738461017609,
90
- "num_tokens": 1310452.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
- "grad_norm": 0.4270714521408081,
96
- "learning_rate": 0.00027794190153442033,
97
- "loss": 0.1734,
98
- "mean_token_accuracy": 0.9492023700475692,
99
- "num_tokens": 1438013.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
- "grad_norm": 0.17301490902900696,
105
- "learning_rate": 0.00027257973606146575,
106
- "loss": 0.1234,
107
- "mean_token_accuracy": 0.9651462835073471,
108
- "num_tokens": 1593568.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
- "grad_norm": 0.42916586995124817,
114
- "learning_rate": 0.0002666987339219681,
115
- "loss": 0.1336,
116
- "mean_token_accuracy": 0.9616361856460571,
117
- "num_tokens": 1721008.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
- "grad_norm": 0.09723316133022308,
123
- "learning_rate": 0.0002603237873178853,
124
- "loss": 0.1095,
125
- "mean_token_accuracy": 0.968398722410202,
126
- "num_tokens": 1877612.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
- "grad_norm": 0.29951852560043335,
132
- "learning_rate": 0.0002534818791433866,
133
- "loss": 0.0958,
134
- "mean_token_accuracy": 0.9727602601051331,
135
- "num_tokens": 2003095.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
- "grad_norm": 0.19163310527801514,
141
- "learning_rate": 0.00024620196877580576,
142
- "loss": 0.094,
143
- "mean_token_accuracy": 0.973841313123703,
144
- "num_tokens": 2159917.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
- "grad_norm": 0.40213829278945923,
150
- "learning_rate": 0.00023851486950083892,
151
- "loss": 0.0911,
152
- "mean_token_accuracy": 0.97495201587677,
153
- "num_tokens": 2287879.0,
154
  "step": 400
155
  },
 
 
 
 
 
 
 
 
 
 
156
  {
157
  "epoch": 1.024140012070006,
158
- "grad_norm": 0.1628786027431488,
159
- "learning_rate": 0.00023045311809080567,
160
- "loss": 0.0952,
161
- "mean_token_accuracy": 0.9737161440947621,
162
- "num_tokens": 2438228.0,
163
  "step": 425
164
  },
165
  {
166
  "epoch": 1.0844900422450212,
167
- "grad_norm": 0.17094825208187103,
168
- "learning_rate": 0.00022205083708799942,
169
- "loss": 0.0561,
170
- "mean_token_accuracy": 0.9833515232801437,
171
- "num_tokens": 2583293.0,
172
  "step": 450
173
  },
174
  {
175
  "epoch": 1.1448400724200363,
176
- "grad_norm": 0.1984417885541916,
177
- "learning_rate": 0.0002133435903760353,
178
- "loss": 0.0719,
179
- "mean_token_accuracy": 0.9798995298147202,
180
- "num_tokens": 2724870.0,
181
  "step": 475
182
  },
183
  {
184
  "epoch": 1.2051901025950513,
185
- "grad_norm": 0.1823195070028305,
186
- "learning_rate": 0.0002043682326505094,
187
- "loss": 0.0466,
188
- "mean_token_accuracy": 0.9866354477405548,
189
- "num_tokens": 2868460.0,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 1.2655401327700664,
194
- "grad_norm": 0.14491313695907593,
195
- "learning_rate": 0.000195162753426108,
196
- "loss": 0.0701,
197
- "mean_token_accuracy": 0.9807595479488372,
198
- "num_tokens": 3011315.0,
199
  "step": 525
200
  },
201
  {
202
  "epoch": 1.3258901629450814,
203
- "grad_norm": 0.14561036229133606,
204
- "learning_rate": 0.00018576611624042852,
205
- "loss": 0.0493,
206
- "mean_token_accuracy": 0.9858711469173431,
207
- "num_tokens": 3153318.0,
208
  "step": 550
209
  },
210
  {
211
  "epoch": 1.3862401931200965,
212
- "grad_norm": 0.08111721277236938,
213
- "learning_rate": 0.00017621809373510641,
214
- "loss": 0.0621,
215
- "mean_token_accuracy": 0.9823315119743348,
216
- "num_tokens": 3296073.0,
217
  "step": 575
218
  },
219
  {
220
  "epoch": 1.4465902232951118,
221
- "grad_norm": 0.0793062075972557,
222
- "learning_rate": 0.00016655909931229048,
223
- "loss": 0.0472,
224
- "mean_token_accuracy": 0.986634315252304,
225
- "num_tokens": 3440522.0,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 1.5069402534701268,
230
- "grad_norm": 0.09899070113897324,
231
- "learning_rate": 0.00015683001607900553,
232
- "loss": 0.0507,
233
- "mean_token_accuracy": 0.9856790328025817,
234
- "num_tokens": 3582759.0,
235
  "step": 625
236
  },
237
  {
238
  "epoch": 1.567290283645142,
239
- "grad_norm": 0.10104668885469437,
240
- "learning_rate": 0.00014707202380342108,
241
- "loss": 0.045,
242
- "mean_token_accuracy": 0.9872064375877381,
243
- "num_tokens": 3724646.0,
244
  "step": 650
245
  },
246
  {
247
  "epoch": 1.627640313820157,
248
- "grad_norm": 0.12305350601673126,
249
- "learning_rate": 0.00013732642461545747,
250
- "loss": 0.061,
251
- "mean_token_accuracy": 0.9832958990335464,
252
- "num_tokens": 3867116.0,
253
  "step": 675
254
  },
255
  {
256
  "epoch": 1.687990343995172,
257
- "grad_norm": 0.07760481536388397,
258
- "learning_rate": 0.00012763446818947865,
259
- "loss": 0.039,
260
- "mean_token_accuracy": 0.9887771773338317,
261
- "num_tokens": 4010711.0,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 1.748340374170187,
266
- "grad_norm": 0.07849643379449844,
267
- "learning_rate": 0.00011803717714901029,
268
- "loss": 0.0611,
269
- "mean_token_accuracy": 0.9825647151470185,
270
- "num_tokens": 4153743.0,
271
  "step": 725
272
  },
273
  {
274
  "epoch": 1.8086904043452021,
275
- "grad_norm": 0.07204271852970123,
276
- "learning_rate": 0.00010857517343248423,
277
- "loss": 0.0371,
278
- "mean_token_accuracy": 0.9890217131376267,
279
- "num_tokens": 4297676.0,
280
  "step": 750
281
  },
282
  {
283
  "epoch": 1.8690404345202172,
284
- "grad_norm": 0.08970830589532852,
285
- "learning_rate": 9.9288506354941e-05,
286
- "loss": 0.0518,
287
- "mean_token_accuracy": 0.9855406028032303,
288
- "num_tokens": 4439199.0,
289
  "step": 775
290
  },
291
  {
292
  "epoch": 1.9293904646952322,
293
- "grad_norm": 0.07621912658214569,
294
- "learning_rate": 9.021648309344443e-05,
295
- "loss": 0.0354,
296
- "mean_token_accuracy": 0.9890959084033966,
297
- "num_tokens": 4581949.0,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 1.9897404948702473,
302
- "grad_norm": 0.07033903896808624,
303
- "learning_rate": 8.139750231370407e-05,
304
- "loss": 0.0416,
305
- "mean_token_accuracy": 0.9879856097698212,
306
- "num_tokens": 4714427.0,
307
  "step": 825
308
  },
 
 
 
 
 
 
 
 
 
 
309
  {
310
  "epoch": 2.048280024140012,
311
- "grad_norm": 0.07433084398508072,
312
- "learning_rate": 7.28688916421049e-05,
313
- "loss": 0.048,
314
- "mean_token_accuracy": 0.9853949061374074,
315
- "num_tokens": 4865666.0,
316
  "step": 850
317
  },
318
  {
319
  "epoch": 2.1086300543150274,
320
- "grad_norm": 0.06543659418821335,
321
- "learning_rate": 6.466674967106751e-05,
322
- "loss": 0.0291,
323
- "mean_token_accuracy": 0.9913258212804794,
324
- "num_tokens": 4998878.0,
325
  "step": 875
326
  },
327
  {
328
  "epoch": 2.1689800844900424,
329
- "grad_norm": 0.06879922747612,
330
- "learning_rate": 5.682579316647408e-05,
331
- "loss": 0.039,
332
- "mean_token_accuracy": 0.9881585425138474,
333
- "num_tokens": 5150646.0,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 2.2293301146650575,
338
- "grad_norm": 0.06347832828760147,
339
- "learning_rate": 4.937921012387816e-05,
340
- "loss": 0.0294,
341
- "mean_token_accuracy": 0.9909294676780701,
342
- "num_tokens": 5285687.0,
343
  "step": 925
344
  },
345
  {
346
  "epoch": 2.2896801448400725,
347
- "grad_norm": 0.08423357456922531,
348
- "learning_rate": 4.235851929545771e-05,
349
- "loss": 0.0379,
350
- "mean_token_accuracy": 0.9882619392871856,
351
- "num_tokens": 5437604.0,
352
  "step": 950
353
  },
354
  {
355
  "epoch": 2.3500301750150876,
356
- "grad_norm": 0.06368061155080795,
357
- "learning_rate": 3.579343678228525e-05,
358
- "loss": 0.0292,
359
- "mean_token_accuracy": 0.9909913945198059,
360
- "num_tokens": 5571761.0,
361
  "step": 975
362
  },
363
  {
364
  "epoch": 2.4103802051901027,
365
- "grad_norm": 0.061880286782979965,
366
- "learning_rate": 2.9711750256582538e-05,
367
  "loss": 0.0403,
368
- "mean_token_accuracy": 0.9878204268217087,
369
- "num_tokens": 5724871.0,
370
  "step": 1000
371
  },
372
  {
373
  "epoch": 2.4707302353651177,
374
- "grad_norm": 0.060907330363988876,
375
- "learning_rate": 2.413920134633272e-05,
376
- "loss": 0.0269,
377
- "mean_token_accuracy": 0.9914020735025406,
378
- "num_tokens": 5859593.0,
379
  "step": 1025
380
  },
381
  {
382
  "epoch": 2.5310802655401328,
383
- "grad_norm": 0.07528848201036453,
384
- "learning_rate": 1.909937668007352e-05,
385
- "loss": 0.0396,
386
- "mean_token_accuracy": 0.9879765379428863,
387
- "num_tokens": 6012549.0,
388
  "step": 1050
389
  },
390
  {
391
  "epoch": 2.591430295715148,
392
- "grad_norm": 0.07129911333322525,
393
- "learning_rate": 1.461360805304146e-05,
394
- "loss": 0.0252,
395
- "mean_token_accuracy": 0.9917944890260696,
396
- "num_tokens": 6146101.0,
397
  "step": 1075
398
  },
399
  {
400
  "epoch": 2.651780325890163,
401
- "grad_norm": 0.055356480181217194,
402
- "learning_rate": 1.0700882137227434e-05,
403
- "loss": 0.0457,
404
- "mean_token_accuracy": 0.9865469449758529,
405
- "num_tokens": 6298782.0,
406
  "step": 1100
407
  },
408
  {
409
  "epoch": 2.712130356065178,
410
- "grad_norm": 0.0652829185128212,
411
- "learning_rate": 7.377760117509834e-06,
412
- "loss": 0.0251,
413
- "mean_token_accuracy": 0.9921035206317902,
414
- "num_tokens": 6430918.0,
415
  "step": 1125
416
  },
417
  {
418
  "epoch": 2.772480386240193,
419
- "grad_norm": 0.05269525945186615,
420
- "learning_rate": 4.65830759401658e-06,
421
- "loss": 0.0382,
422
- "mean_token_accuracy": 0.9884834003448486,
423
- "num_tokens": 6580407.0,
424
  "step": 1150
425
  },
426
  {
427
  "epoch": 2.832830416415208,
428
- "grad_norm": 0.07319523394107819,
429
- "learning_rate": 2.554035047414732e-06,
430
- "loss": 0.0244,
431
- "mean_token_accuracy": 0.9923358470201492,
432
- "num_tokens": 6712568.0,
433
  "step": 1175
434
  },
435
  {
436
  "epoch": 2.8931804465902236,
437
- "grad_norm": 0.07114146649837494,
438
- "learning_rate": 1.0738491191171372e-06,
439
- "loss": 0.0404,
440
- "mean_token_accuracy": 0.9876986140012741,
441
- "num_tokens": 6865440.0,
442
  "step": 1200
443
  },
444
  {
445
  "epoch": 2.9535304767652386,
446
- "grad_norm": 0.05154518783092499,
447
- "learning_rate": 2.2401491261947456e-07,
448
- "loss": 0.0261,
449
- "mean_token_accuracy": 0.9918761789798737,
450
- "num_tokens": 7000309.0,
451
  "step": 1225
 
 
 
 
 
 
 
 
 
 
452
  }
453
  ],
454
  "logging_steps": 25,
455
- "max_steps": 1245,
456
  "num_input_tokens_seen": 0,
457
- "num_train_epochs": 3,
458
- "save_steps": 750,
459
  "stateful_callbacks": {
460
  "TrainerControl": {
461
  "args": {
@@ -463,12 +493,12 @@
463
  "should_evaluate": false,
464
  "should_log": false,
465
  "should_save": true,
466
- "should_training_stop": true
467
  },
468
  "attributes": {}
469
  }
470
  },
471
- "total_flos": 3.26612671378901e+17,
472
  "train_batch_size": 2,
473
  "trial_name": null,
474
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
+ "grad_norm": 0.3188655376434326,
15
+ "learning_rate": 0.00014399999999999998,
16
+ "loss": 1.86,
17
+ "mean_token_accuracy": 0.6245196205377579,
18
+ "num_tokens": 157541.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
+ "grad_norm": 0.2658841907978058,
24
+ "learning_rate": 0.000294,
25
+ "loss": 0.8983,
26
+ "mean_token_accuracy": 0.7745399290323257,
27
+ "num_tokens": 284307.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
+ "grad_norm": 0.24192510545253754,
33
+ "learning_rate": 0.00029983554299928354,
34
+ "loss": 0.6083,
35
+ "mean_token_accuracy": 0.8323455977439881,
36
+ "num_tokens": 440617.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
+ "grad_norm": 0.3552621006965637,
42
+ "learning_rate": 0.00029931487386844626,
43
+ "loss": 0.5333,
44
+ "mean_token_accuracy": 0.8501472049951553,
45
+ "num_tokens": 567051.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
+ "grad_norm": 0.29719677567481995,
51
+ "learning_rate": 0.000298438945846945,
52
+ "loss": 0.4156,
53
+ "mean_token_accuracy": 0.8798932474851608,
54
+ "num_tokens": 724100.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
+ "grad_norm": 0.30843958258628845,
60
+ "learning_rate": 0.0002972098429951895,
61
+ "loss": 0.3274,
62
+ "mean_token_accuracy": 0.9020548111200333,
63
+ "num_tokens": 852073.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
+ "grad_norm": 0.3006002604961395,
69
+ "learning_rate": 0.0002956304896682979,
70
+ "loss": 0.2719,
71
+ "mean_token_accuracy": 0.9195013505220413,
72
+ "num_tokens": 1012131.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
+ "grad_norm": 0.39749929308891296,
78
+ "learning_rate": 0.00029370464355829616,
79
+ "loss": 0.2304,
80
+ "mean_token_accuracy": 0.9330711585283279,
81
+ "num_tokens": 1138475.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
+ "grad_norm": 0.21084098517894745,
87
+ "learning_rate": 0.00029143688675359184,
88
+ "loss": 0.1776,
89
+ "mean_token_accuracy": 0.9493161207437515,
90
+ "num_tokens": 1298548.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
+ "grad_norm": 0.2666475772857666,
96
+ "learning_rate": 0.000288832614836995,
97
+ "loss": 0.1483,
98
+ "mean_token_accuracy": 0.9569978493452073,
99
+ "num_tokens": 1428047.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
+ "grad_norm": 0.291533887386322,
105
+ "learning_rate": 0.00028589802404822455,
106
+ "loss": 0.1563,
107
+ "mean_token_accuracy": 0.9560109853744507,
108
+ "num_tokens": 1586758.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
+ "grad_norm": 0.3723059892654419,
114
+ "learning_rate": 0.0002826400965414433,
115
+ "loss": 0.1303,
116
+ "mean_token_accuracy": 0.9621474850177765,
117
+ "num_tokens": 1714018.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
+ "grad_norm": 0.39150121808052063,
123
+ "learning_rate": 0.00027906658377289907,
124
+ "loss": 0.115,
125
+ "mean_token_accuracy": 0.9675602000951767,
126
+ "num_tokens": 1872256.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
+ "grad_norm": 0.3891217112541199,
132
+ "learning_rate": 0.0002751859880581954,
133
+ "loss": 0.1052,
134
+ "mean_token_accuracy": 0.9699741625785827,
135
+ "num_tokens": 1999407.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
+ "grad_norm": 0.15646834671497345,
141
+ "learning_rate": 0.00027100754234307293,
142
+ "loss": 0.095,
143
+ "mean_token_accuracy": 0.9727736663818359,
144
+ "num_tokens": 2159407.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
+ "grad_norm": 0.3074830174446106,
150
+ "learning_rate": 0.00026654118823583243,
151
+ "loss": 0.1027,
152
+ "mean_token_accuracy": 0.9708205509185791,
153
+ "num_tokens": 2287233.0,
154
  "step": 400
155
  },
156
+ {
157
+ "epoch": 1.0,
158
+ "eval_loss": 0.08644451200962067,
159
+ "eval_mean_token_accuracy": 0.9759664358319463,
160
+ "eval_num_tokens": 2367283.0,
161
+ "eval_runtime": 72.5321,
162
+ "eval_samples_per_second": 5.087,
163
+ "eval_steps_per_second": 2.551,
164
+ "step": 415
165
+ },
166
  {
167
  "epoch": 1.024140012070006,
168
+ "grad_norm": 0.12701059877872467,
169
+ "learning_rate": 0.00026179755235366435,
170
+ "loss": 0.0911,
171
+ "mean_token_accuracy": 0.9744510755096514,
172
+ "num_tokens": 2436776.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
+ "grad_norm": 0.18361371755599976,
178
+ "learning_rate": 0.00025678792103916504,
179
+ "loss": 0.0605,
180
+ "mean_token_accuracy": 0.9830399179458618,
181
+ "num_tokens": 2579134.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
+ "grad_norm": 0.14009377360343933,
187
+ "learning_rate": 0.0002515242135071945,
188
+ "loss": 0.0841,
189
+ "mean_token_accuracy": 0.9770982998609543,
190
+ "num_tokens": 2723253.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
+ "grad_norm": 0.2148066610097885,
196
+ "learning_rate": 0.0002460189534859663,
197
+ "loss": 0.0574,
198
+ "mean_token_accuracy": 0.983966583609581,
199
+ "num_tokens": 2868154.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
+ "grad_norm": 0.13684915006160736,
205
+ "learning_rate": 0.00024028523941984378,
206
+ "loss": 0.0752,
207
+ "mean_token_accuracy": 0.9793112319707871,
208
+ "num_tokens": 3010656.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
+ "grad_norm": 0.1301647573709488,
214
+ "learning_rate": 0.00023433671330473613,
215
+ "loss": 0.0548,
216
+ "mean_token_accuracy": 0.9846732890605927,
217
+ "num_tokens": 3154781.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
+ "grad_norm": 0.12535132467746735,
223
+ "learning_rate": 0.00022818752823024516,
224
+ "loss": 0.0633,
225
+ "mean_token_accuracy": 0.9826712667942047,
226
+ "num_tokens": 3297498.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
+ "grad_norm": 0.1197441816329956,
232
+ "learning_rate": 0.00022185231470578672,
233
+ "loss": 0.0457,
234
+ "mean_token_accuracy": 0.986530932188034,
235
+ "num_tokens": 3439937.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
+ "grad_norm": 0.09161168336868286,
241
+ "learning_rate": 0.00021534614585080636,
242
+ "loss": 0.0661,
243
+ "mean_token_accuracy": 0.980983544588089,
244
+ "num_tokens": 3581658.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
+ "grad_norm": 0.1546151489019394,
250
+ "learning_rate": 0.0002086845015319108,
251
+ "loss": 0.0432,
252
+ "mean_token_accuracy": 0.9872822916507721,
253
+ "num_tokens": 3724450.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
+ "grad_norm": 0.10510735213756561,
259
+ "learning_rate": 0.00020188323153224244,
260
+ "loss": 0.0615,
261
+ "mean_token_accuracy": 0.9828537595272064,
262
+ "num_tokens": 3865114.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
+ "grad_norm": 0.09251394867897034,
268
+ "learning_rate": 0.00019495851784072558,
269
+ "loss": 0.0426,
270
+ "mean_token_accuracy": 0.9873446094989776,
271
+ "num_tokens": 4007753.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
+ "grad_norm": 0.09151948243379593,
277
+ "learning_rate": 0.00018792683615090954,
278
+ "loss": 0.0559,
279
+ "mean_token_accuracy": 0.9842093575000763,
280
+ "num_tokens": 4149621.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
+ "grad_norm": 0.17836125195026398,
286
+ "learning_rate": 0.00018080491666101184,
287
+ "loss": 0.0403,
288
+ "mean_token_accuracy": 0.9877779418230057,
289
+ "num_tokens": 4293130.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
+ "grad_norm": 0.10722211748361588,
295
+ "learning_rate": 0.00017360970426842824,
296
+ "loss": 0.0507,
297
+ "mean_token_accuracy": 0.9855174136161804,
298
+ "num_tokens": 4435313.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
+ "grad_norm": 0.08690394461154938,
304
+ "learning_rate": 0.00016635831825341846,
305
+ "loss": 0.0388,
306
+ "mean_token_accuracy": 0.9888739967346192,
307
+ "num_tokens": 4579734.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
+ "grad_norm": 0.07448932528495789,
313
+ "learning_rate": 0.00015906801154788881,
314
+ "loss": 0.0442,
315
+ "mean_token_accuracy": 0.9876042759418487,
316
+ "num_tokens": 4714195.0,
317
  "step": 825
318
  },
319
+ {
320
+ "epoch": 2.0,
321
+ "eval_loss": 0.04987528547644615,
322
+ "eval_mean_token_accuracy": 0.9863727804776784,
323
+ "eval_num_tokens": 4734566.0,
324
+ "eval_runtime": 72.6082,
325
+ "eval_samples_per_second": 5.082,
326
+ "eval_steps_per_second": 2.548,
327
+ "step": 830
328
+ },
329
  {
330
  "epoch": 2.048280024140012,
331
+ "grad_norm": 0.06745678931474686,
332
+ "learning_rate": 0.00015175612968618376,
333
+ "loss": 0.0459,
334
+ "mean_token_accuracy": 0.9859098005540592,
335
+ "num_tokens": 4863820.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
+ "grad_norm": 0.0881015732884407,
341
+ "learning_rate": 0.00014444006953555127,
342
+ "loss": 0.0295,
343
+ "mean_token_accuracy": 0.9908721047639847,
344
+ "num_tokens": 4997194.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
+ "grad_norm": 0.06334047764539719,
350
+ "learning_rate": 0.00013713723790447483,
351
+ "loss": 0.0496,
352
+ "mean_token_accuracy": 0.9859565341472626,
353
+ "num_tokens": 5149497.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
+ "grad_norm": 0.0663692057132721,
359
+ "learning_rate": 0.0001298650101273517,
360
+ "loss": 0.0272,
361
+ "mean_token_accuracy": 0.9913459432125091,
362
+ "num_tokens": 5284646.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
+ "grad_norm": 0.0703674927353859,
368
+ "learning_rate": 0.00012264068872405698,
369
+ "loss": 0.0408,
370
+ "mean_token_accuracy": 0.9879277718067169,
371
+ "num_tokens": 5436909.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
+ "grad_norm": 0.05159657076001167,
377
+ "learning_rate": 0.00011548146223275205,
378
+ "loss": 0.0293,
379
+ "mean_token_accuracy": 0.9910051214694977,
380
+ "num_tokens": 5569302.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
+ "grad_norm": 0.0799742266535759,
386
+ "learning_rate": 0.00010840436431388485,
387
  "loss": 0.0403,
388
+ "mean_token_accuracy": 0.9877477496862411,
389
+ "num_tokens": 5722331.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
+ "grad_norm": 0.056557413190603256,
395
+ "learning_rate": 0.00010142623322268498,
396
+ "loss": 0.0287,
397
+ "mean_token_accuracy": 0.9909706234931945,
398
+ "num_tokens": 5856158.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
+ "grad_norm": 0.0646870955824852,
404
+ "learning_rate": 9.456367174657846e-05,
405
+ "loss": 0.0423,
406
+ "mean_token_accuracy": 0.987390770316124,
407
+ "num_tokens": 6007433.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
+ "grad_norm": 0.039614204317331314,
413
+ "learning_rate": 8.783300770284126e-05,
414
+ "loss": 0.0271,
415
+ "mean_token_accuracy": 0.9912664991617203,
416
+ "num_tokens": 6141256.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
+ "grad_norm": 0.0858864039182663,
422
+ "learning_rate": 8.125025509047871e-05,
423
+ "loss": 0.0347,
424
+ "mean_token_accuracy": 0.9889630949497223,
425
+ "num_tokens": 6293285.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
+ "grad_norm": 0.09786231815814972,
431
+ "learning_rate": 7.483107598875994e-05,
432
+ "loss": 0.0255,
433
+ "mean_token_accuracy": 0.991877788901329,
434
+ "num_tokens": 6427387.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
+ "grad_norm": 0.060420017689466476,
440
+ "learning_rate": 6.859074329306077e-05,
441
+ "loss": 0.0386,
442
+ "mean_token_accuracy": 0.9882360059022903,
443
+ "num_tokens": 6580270.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
+ "grad_norm": 0.0470358170568943,
449
+ "learning_rate": 6.254410437667635e-05,
450
+ "loss": 0.026,
451
+ "mean_token_accuracy": 0.9918814355134964,
452
+ "num_tokens": 6713537.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
+ "grad_norm": 0.06813222169876099,
458
+ "learning_rate": 5.670554576506152e-05,
459
+ "loss": 0.0429,
460
+ "mean_token_accuracy": 0.9869187504053116,
461
+ "num_tokens": 6867844.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
+ "grad_norm": 0.05573548004031181,
467
+ "learning_rate": 5.108895890654753e-05,
468
+ "loss": 0.0279,
469
+ "mean_token_accuracy": 0.991449517607689,
470
+ "num_tokens": 7002067.0,
471
  "step": 1225
472
+ },
473
+ {
474
+ "epoch": 3.0,
475
+ "eval_loss": 0.043443720787763596,
476
+ "eval_mean_token_accuracy": 0.9880821160368017,
477
+ "eval_num_tokens": 7101849.0,
478
+ "eval_runtime": 72.5056,
479
+ "eval_samples_per_second": 5.089,
480
+ "eval_steps_per_second": 2.552,
481
+ "step": 1245
482
  }
483
  ],
484
  "logging_steps": 25,
485
+ "max_steps": 1660,
486
  "num_input_tokens_seen": 0,
487
+ "num_train_epochs": 4,
488
+ "save_steps": 500,
489
  "stateful_callbacks": {
490
  "TrainerControl": {
491
  "args": {
 
493
  "should_evaluate": false,
494
  "should_log": false,
495
  "should_save": true,
496
+ "should_training_stop": false
497
  },
498
  "attributes": {}
499
  }
500
  },
501
+ "total_flos": 3.265941598976164e+17,
502
  "train_batch_size": 2,
503
  "trial_name": null,
504
  "trial_params": null
checkpoint-1245/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7e0d8c6c509827c1c38daeb1f564df9c52039702bf2e2293954393e7867f804
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa67ef66fca6209c5a81244c077133db48814d04ffb68d47dcf6047e1890fc8f
3
  size 6033
checkpoint-1660/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-8B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2
checkpoint-1660/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-8B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "k_proj",
29
+ "v_proj",
30
+ "gate_proj",
31
+ "o_proj",
32
+ "down_proj",
33
+ "up_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
checkpoint-1660/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea2ad39cdd86eff3c4df726c9ae56cab758f941a5339c78780151d5f982614d6
3
+ size 349243752
checkpoint-1660/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-1660/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
checkpoint-1660/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1660/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6737cd1e3f29baa257ea7eb2ae89c6342cab13c9142039f6c497af72e8bb36ef
3
+ size 698662547
checkpoint-1660/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d6bb50ea24906e5110a15499d2a9d32288557f095bede9872f9afda5c752b4b
3
+ size 14645
checkpoint-1660/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc85a65ac2e2afe9d530a3895150368dd9e7507622696dbf23bba79d510dda80
3
+ size 1465
checkpoint-1660/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
checkpoint-1660/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69fae70b4e2890472c74ae51adff7a0f50c32b6bfbea38cd97da67fea79a12bb
3
+ size 11422819
checkpoint-1660/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|im_end|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
checkpoint-1660/trainer_state.json ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1660,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.060350030175015085,
14
+ "grad_norm": 0.3188655376434326,
15
+ "learning_rate": 0.00014399999999999998,
16
+ "loss": 1.86,
17
+ "mean_token_accuracy": 0.6245196205377579,
18
+ "num_tokens": 157541.0,
19
+ "step": 25
20
+ },
21
+ {
22
+ "epoch": 0.12070006035003017,
23
+ "grad_norm": 0.2658841907978058,
24
+ "learning_rate": 0.000294,
25
+ "loss": 0.8983,
26
+ "mean_token_accuracy": 0.7745399290323257,
27
+ "num_tokens": 284307.0,
28
+ "step": 50
29
+ },
30
+ {
31
+ "epoch": 0.18105009052504525,
32
+ "grad_norm": 0.24192510545253754,
33
+ "learning_rate": 0.00029983554299928354,
34
+ "loss": 0.6083,
35
+ "mean_token_accuracy": 0.8323455977439881,
36
+ "num_tokens": 440617.0,
37
+ "step": 75
38
+ },
39
+ {
40
+ "epoch": 0.24140012070006034,
41
+ "grad_norm": 0.3552621006965637,
42
+ "learning_rate": 0.00029931487386844626,
43
+ "loss": 0.5333,
44
+ "mean_token_accuracy": 0.8501472049951553,
45
+ "num_tokens": 567051.0,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.30175015087507545,
50
+ "grad_norm": 0.29719677567481995,
51
+ "learning_rate": 0.000298438945846945,
52
+ "loss": 0.4156,
53
+ "mean_token_accuracy": 0.8798932474851608,
54
+ "num_tokens": 724100.0,
55
+ "step": 125
56
+ },
57
+ {
58
+ "epoch": 0.3621001810500905,
59
+ "grad_norm": 0.30843958258628845,
60
+ "learning_rate": 0.0002972098429951895,
61
+ "loss": 0.3274,
62
+ "mean_token_accuracy": 0.9020548111200333,
63
+ "num_tokens": 852073.0,
64
+ "step": 150
65
+ },
66
+ {
67
+ "epoch": 0.4224502112251056,
68
+ "grad_norm": 0.3006002604961395,
69
+ "learning_rate": 0.0002956304896682979,
70
+ "loss": 0.2719,
71
+ "mean_token_accuracy": 0.9195013505220413,
72
+ "num_tokens": 1012131.0,
73
+ "step": 175
74
+ },
75
+ {
76
+ "epoch": 0.4828002414001207,
77
+ "grad_norm": 0.39749929308891296,
78
+ "learning_rate": 0.00029370464355829616,
79
+ "loss": 0.2304,
80
+ "mean_token_accuracy": 0.9330711585283279,
81
+ "num_tokens": 1138475.0,
82
+ "step": 200
83
+ },
84
+ {
85
+ "epoch": 0.5431502715751357,
86
+ "grad_norm": 0.21084098517894745,
87
+ "learning_rate": 0.00029143688675359184,
88
+ "loss": 0.1776,
89
+ "mean_token_accuracy": 0.9493161207437515,
90
+ "num_tokens": 1298548.0,
91
+ "step": 225
92
+ },
93
+ {
94
+ "epoch": 0.6035003017501509,
95
+ "grad_norm": 0.2666475772857666,
96
+ "learning_rate": 0.000288832614836995,
97
+ "loss": 0.1483,
98
+ "mean_token_accuracy": 0.9569978493452073,
99
+ "num_tokens": 1428047.0,
100
+ "step": 250
101
+ },
102
+ {
103
+ "epoch": 0.663850331925166,
104
+ "grad_norm": 0.291533887386322,
105
+ "learning_rate": 0.00028589802404822455,
106
+ "loss": 0.1563,
107
+ "mean_token_accuracy": 0.9560109853744507,
108
+ "num_tokens": 1586758.0,
109
+ "step": 275
110
+ },
111
+ {
112
+ "epoch": 0.724200362100181,
113
+ "grad_norm": 0.3723059892654419,
114
+ "learning_rate": 0.0002826400965414433,
115
+ "loss": 0.1303,
116
+ "mean_token_accuracy": 0.9621474850177765,
117
+ "num_tokens": 1714018.0,
118
+ "step": 300
119
+ },
120
+ {
121
+ "epoch": 0.7845503922751962,
122
+ "grad_norm": 0.39150121808052063,
123
+ "learning_rate": 0.00027906658377289907,
124
+ "loss": 0.115,
125
+ "mean_token_accuracy": 0.9675602000951767,
126
+ "num_tokens": 1872256.0,
127
+ "step": 325
128
+ },
129
+ {
130
+ "epoch": 0.8449004224502112,
131
+ "grad_norm": 0.3891217112541199,
132
+ "learning_rate": 0.0002751859880581954,
133
+ "loss": 0.1052,
134
+ "mean_token_accuracy": 0.9699741625785827,
135
+ "num_tokens": 1999407.0,
136
+ "step": 350
137
+ },
138
+ {
139
+ "epoch": 0.9052504526252263,
140
+ "grad_norm": 0.15646834671497345,
141
+ "learning_rate": 0.00027100754234307293,
142
+ "loss": 0.095,
143
+ "mean_token_accuracy": 0.9727736663818359,
144
+ "num_tokens": 2159407.0,
145
+ "step": 375
146
+ },
147
+ {
148
+ "epoch": 0.9656004828002414,
149
+ "grad_norm": 0.3074830174446106,
150
+ "learning_rate": 0.00026654118823583243,
151
+ "loss": 0.1027,
152
+ "mean_token_accuracy": 0.9708205509185791,
153
+ "num_tokens": 2287233.0,
154
+ "step": 400
155
+ },
156
+ {
157
+ "epoch": 1.0,
158
+ "eval_loss": 0.08644451200962067,
159
+ "eval_mean_token_accuracy": 0.9759664358319463,
160
+ "eval_num_tokens": 2367283.0,
161
+ "eval_runtime": 72.5321,
162
+ "eval_samples_per_second": 5.087,
163
+ "eval_steps_per_second": 2.551,
164
+ "step": 415
165
+ },
166
+ {
167
+ "epoch": 1.024140012070006,
168
+ "grad_norm": 0.12701059877872467,
169
+ "learning_rate": 0.00026179755235366435,
170
+ "loss": 0.0911,
171
+ "mean_token_accuracy": 0.9744510755096514,
172
+ "num_tokens": 2436776.0,
173
+ "step": 425
174
+ },
175
+ {
176
+ "epoch": 1.0844900422450212,
177
+ "grad_norm": 0.18361371755599976,
178
+ "learning_rate": 0.00025678792103916504,
179
+ "loss": 0.0605,
180
+ "mean_token_accuracy": 0.9830399179458618,
181
+ "num_tokens": 2579134.0,
182
+ "step": 450
183
+ },
184
+ {
185
+ "epoch": 1.1448400724200363,
186
+ "grad_norm": 0.14009377360343933,
187
+ "learning_rate": 0.0002515242135071945,
188
+ "loss": 0.0841,
189
+ "mean_token_accuracy": 0.9770982998609543,
190
+ "num_tokens": 2723253.0,
191
+ "step": 475
192
+ },
193
+ {
194
+ "epoch": 1.2051901025950513,
195
+ "grad_norm": 0.2148066610097885,
196
+ "learning_rate": 0.0002460189534859663,
197
+ "loss": 0.0574,
198
+ "mean_token_accuracy": 0.983966583609581,
199
+ "num_tokens": 2868154.0,
200
+ "step": 500
201
+ },
202
+ {
203
+ "epoch": 1.2655401327700664,
204
+ "grad_norm": 0.13684915006160736,
205
+ "learning_rate": 0.00024028523941984378,
206
+ "loss": 0.0752,
207
+ "mean_token_accuracy": 0.9793112319707871,
208
+ "num_tokens": 3010656.0,
209
+ "step": 525
210
+ },
211
+ {
212
+ "epoch": 1.3258901629450814,
213
+ "grad_norm": 0.1301647573709488,
214
+ "learning_rate": 0.00023433671330473613,
215
+ "loss": 0.0548,
216
+ "mean_token_accuracy": 0.9846732890605927,
217
+ "num_tokens": 3154781.0,
218
+ "step": 550
219
+ },
220
+ {
221
+ "epoch": 1.3862401931200965,
222
+ "grad_norm": 0.12535132467746735,
223
+ "learning_rate": 0.00022818752823024516,
224
+ "loss": 0.0633,
225
+ "mean_token_accuracy": 0.9826712667942047,
226
+ "num_tokens": 3297498.0,
227
+ "step": 575
228
+ },
229
+ {
230
+ "epoch": 1.4465902232951118,
231
+ "grad_norm": 0.1197441816329956,
232
+ "learning_rate": 0.00022185231470578672,
233
+ "loss": 0.0457,
234
+ "mean_token_accuracy": 0.986530932188034,
235
+ "num_tokens": 3439937.0,
236
+ "step": 600
237
+ },
238
+ {
239
+ "epoch": 1.5069402534701268,
240
+ "grad_norm": 0.09161168336868286,
241
+ "learning_rate": 0.00021534614585080636,
242
+ "loss": 0.0661,
243
+ "mean_token_accuracy": 0.980983544588089,
244
+ "num_tokens": 3581658.0,
245
+ "step": 625
246
+ },
247
+ {
248
+ "epoch": 1.567290283645142,
249
+ "grad_norm": 0.1546151489019394,
250
+ "learning_rate": 0.0002086845015319108,
251
+ "loss": 0.0432,
252
+ "mean_token_accuracy": 0.9872822916507721,
253
+ "num_tokens": 3724450.0,
254
+ "step": 650
255
+ },
256
+ {
257
+ "epoch": 1.627640313820157,
258
+ "grad_norm": 0.10510735213756561,
259
+ "learning_rate": 0.00020188323153224244,
260
+ "loss": 0.0615,
261
+ "mean_token_accuracy": 0.9828537595272064,
262
+ "num_tokens": 3865114.0,
263
+ "step": 675
264
+ },
265
+ {
266
+ "epoch": 1.687990343995172,
267
+ "grad_norm": 0.09251394867897034,
268
+ "learning_rate": 0.00019495851784072558,
269
+ "loss": 0.0426,
270
+ "mean_token_accuracy": 0.9873446094989776,
271
+ "num_tokens": 4007753.0,
272
+ "step": 700
273
+ },
274
+ {
275
+ "epoch": 1.748340374170187,
276
+ "grad_norm": 0.09151948243379593,
277
+ "learning_rate": 0.00018792683615090954,
278
+ "loss": 0.0559,
279
+ "mean_token_accuracy": 0.9842093575000763,
280
+ "num_tokens": 4149621.0,
281
+ "step": 725
282
+ },
283
+ {
284
+ "epoch": 1.8086904043452021,
285
+ "grad_norm": 0.17836125195026398,
286
+ "learning_rate": 0.00018080491666101184,
287
+ "loss": 0.0403,
288
+ "mean_token_accuracy": 0.9877779418230057,
289
+ "num_tokens": 4293130.0,
290
+ "step": 750
291
+ },
292
+ {
293
+ "epoch": 1.8690404345202172,
294
+ "grad_norm": 0.10722211748361588,
295
+ "learning_rate": 0.00017360970426842824,
296
+ "loss": 0.0507,
297
+ "mean_token_accuracy": 0.9855174136161804,
298
+ "num_tokens": 4435313.0,
299
+ "step": 775
300
+ },
301
+ {
302
+ "epoch": 1.9293904646952322,
303
+ "grad_norm": 0.08690394461154938,
304
+ "learning_rate": 0.00016635831825341846,
305
+ "loss": 0.0388,
306
+ "mean_token_accuracy": 0.9888739967346192,
307
+ "num_tokens": 4579734.0,
308
+ "step": 800
309
+ },
310
+ {
311
+ "epoch": 1.9897404948702473,
312
+ "grad_norm": 0.07448932528495789,
313
+ "learning_rate": 0.00015906801154788881,
314
+ "loss": 0.0442,
315
+ "mean_token_accuracy": 0.9876042759418487,
316
+ "num_tokens": 4714195.0,
317
+ "step": 825
318
+ },
319
+ {
320
+ "epoch": 2.0,
321
+ "eval_loss": 0.04987528547644615,
322
+ "eval_mean_token_accuracy": 0.9863727804776784,
323
+ "eval_num_tokens": 4734566.0,
324
+ "eval_runtime": 72.6082,
325
+ "eval_samples_per_second": 5.082,
326
+ "eval_steps_per_second": 2.548,
327
+ "step": 830
328
+ },
329
+ {
330
+ "epoch": 2.048280024140012,
331
+ "grad_norm": 0.06745678931474686,
332
+ "learning_rate": 0.00015175612968618376,
333
+ "loss": 0.0459,
334
+ "mean_token_accuracy": 0.9859098005540592,
335
+ "num_tokens": 4863820.0,
336
+ "step": 850
337
+ },
338
+ {
339
+ "epoch": 2.1086300543150274,
340
+ "grad_norm": 0.0881015732884407,
341
+ "learning_rate": 0.00014444006953555127,
342
+ "loss": 0.0295,
343
+ "mean_token_accuracy": 0.9908721047639847,
344
+ "num_tokens": 4997194.0,
345
+ "step": 875
346
+ },
347
+ {
348
+ "epoch": 2.1689800844900424,
349
+ "grad_norm": 0.06334047764539719,
350
+ "learning_rate": 0.00013713723790447483,
351
+ "loss": 0.0496,
352
+ "mean_token_accuracy": 0.9859565341472626,
353
+ "num_tokens": 5149497.0,
354
+ "step": 900
355
+ },
356
+ {
357
+ "epoch": 2.2293301146650575,
358
+ "grad_norm": 0.0663692057132721,
359
+ "learning_rate": 0.0001298650101273517,
360
+ "loss": 0.0272,
361
+ "mean_token_accuracy": 0.9913459432125091,
362
+ "num_tokens": 5284646.0,
363
+ "step": 925
364
+ },
365
+ {
366
+ "epoch": 2.2896801448400725,
367
+ "grad_norm": 0.0703674927353859,
368
+ "learning_rate": 0.00012264068872405698,
369
+ "loss": 0.0408,
370
+ "mean_token_accuracy": 0.9879277718067169,
371
+ "num_tokens": 5436909.0,
372
+ "step": 950
373
+ },
374
+ {
375
+ "epoch": 2.3500301750150876,
376
+ "grad_norm": 0.05159657076001167,
377
+ "learning_rate": 0.00011548146223275205,
378
+ "loss": 0.0293,
379
+ "mean_token_accuracy": 0.9910051214694977,
380
+ "num_tokens": 5569302.0,
381
+ "step": 975
382
+ },
383
+ {
384
+ "epoch": 2.4103802051901027,
385
+ "grad_norm": 0.0799742266535759,
386
+ "learning_rate": 0.00010840436431388485,
387
+ "loss": 0.0403,
388
+ "mean_token_accuracy": 0.9877477496862411,
389
+ "num_tokens": 5722331.0,
390
+ "step": 1000
391
+ },
392
+ {
393
+ "epoch": 2.4707302353651177,
394
+ "grad_norm": 0.056557413190603256,
395
+ "learning_rate": 0.00010142623322268498,
396
+ "loss": 0.0287,
397
+ "mean_token_accuracy": 0.9909706234931945,
398
+ "num_tokens": 5856158.0,
399
+ "step": 1025
400
+ },
401
+ {
402
+ "epoch": 2.5310802655401328,
403
+ "grad_norm": 0.0646870955824852,
404
+ "learning_rate": 9.456367174657846e-05,
405
+ "loss": 0.0423,
406
+ "mean_token_accuracy": 0.987390770316124,
407
+ "num_tokens": 6007433.0,
408
+ "step": 1050
409
+ },
410
+ {
411
+ "epoch": 2.591430295715148,
412
+ "grad_norm": 0.039614204317331314,
413
+ "learning_rate": 8.783300770284126e-05,
414
+ "loss": 0.0271,
415
+ "mean_token_accuracy": 0.9912664991617203,
416
+ "num_tokens": 6141256.0,
417
+ "step": 1075
418
+ },
419
+ {
420
+ "epoch": 2.651780325890163,
421
+ "grad_norm": 0.0858864039182663,
422
+ "learning_rate": 8.125025509047871e-05,
423
+ "loss": 0.0347,
424
+ "mean_token_accuracy": 0.9889630949497223,
425
+ "num_tokens": 6293285.0,
426
+ "step": 1100
427
+ },
428
+ {
429
+ "epoch": 2.712130356065178,
430
+ "grad_norm": 0.09786231815814972,
431
+ "learning_rate": 7.483107598875994e-05,
432
+ "loss": 0.0255,
433
+ "mean_token_accuracy": 0.991877788901329,
434
+ "num_tokens": 6427387.0,
435
+ "step": 1125
436
+ },
437
+ {
438
+ "epoch": 2.772480386240193,
439
+ "grad_norm": 0.060420017689466476,
440
+ "learning_rate": 6.859074329306077e-05,
441
+ "loss": 0.0386,
442
+ "mean_token_accuracy": 0.9882360059022903,
443
+ "num_tokens": 6580270.0,
444
+ "step": 1150
445
+ },
446
+ {
447
+ "epoch": 2.832830416415208,
448
+ "grad_norm": 0.0470358170568943,
449
+ "learning_rate": 6.254410437667635e-05,
450
+ "loss": 0.026,
451
+ "mean_token_accuracy": 0.9918814355134964,
452
+ "num_tokens": 6713537.0,
453
+ "step": 1175
454
+ },
455
+ {
456
+ "epoch": 2.8931804465902236,
457
+ "grad_norm": 0.06813222169876099,
458
+ "learning_rate": 5.670554576506152e-05,
459
+ "loss": 0.0429,
460
+ "mean_token_accuracy": 0.9869187504053116,
461
+ "num_tokens": 6867844.0,
462
+ "step": 1200
463
+ },
464
+ {
465
+ "epoch": 2.9535304767652386,
466
+ "grad_norm": 0.05573548004031181,
467
+ "learning_rate": 5.108895890654753e-05,
468
+ "loss": 0.0279,
469
+ "mean_token_accuracy": 0.991449517607689,
470
+ "num_tokens": 7002067.0,
471
+ "step": 1225
472
+ },
473
+ {
474
+ "epoch": 3.0,
475
+ "eval_loss": 0.043443720787763596,
476
+ "eval_mean_token_accuracy": 0.9880821160368017,
477
+ "eval_num_tokens": 7101849.0,
478
+ "eval_runtime": 72.5056,
479
+ "eval_samples_per_second": 5.089,
480
+ "eval_steps_per_second": 2.552,
481
+ "step": 1245
482
+ },
483
+ {
484
+ "epoch": 3.012070006035003,
485
+ "grad_norm": 0.05211171880364418,
486
+ "learning_rate": 4.570770712097641e-05,
487
+ "loss": 0.0354,
488
+ "mean_token_accuracy": 0.98871831119675,
489
+ "num_tokens": 7138755.0,
490
+ "step": 1250
491
+ },
492
+ {
493
+ "epoch": 3.0724200362100182,
494
+ "grad_norm": 0.07240189611911774,
495
+ "learning_rate": 4.057459380488989e-05,
496
+ "loss": 0.0234,
497
+ "mean_token_accuracy": 0.9925487804412841,
498
+ "num_tokens": 7287577.0,
499
+ "step": 1275
500
+ },
501
+ {
502
+ "epoch": 3.1327700663850333,
503
+ "grad_norm": 0.10089480876922607,
504
+ "learning_rate": 3.570183196892178e-05,
505
+ "loss": 0.0316,
506
+ "mean_token_accuracy": 0.9904714208841324,
507
+ "num_tokens": 7423900.0,
508
+ "step": 1300
509
+ },
510
+ {
511
+ "epoch": 3.1931200965600484,
512
+ "grad_norm": 0.07049904763698578,
513
+ "learning_rate": 3.110101517987129e-05,
514
+ "loss": 0.0245,
515
+ "mean_token_accuracy": 0.9921106594800949,
516
+ "num_tokens": 7573131.0,
517
+ "step": 1325
518
+ },
519
+ {
520
+ "epoch": 3.2534701267350634,
521
+ "grad_norm": 0.09604395180940628,
522
+ "learning_rate": 2.6783089976594708e-05,
523
+ "loss": 0.0288,
524
+ "mean_token_accuracy": 0.9908786052465439,
525
+ "num_tokens": 7708680.0,
526
+ "step": 1350
527
+ },
528
+ {
529
+ "epoch": 3.3138201569100785,
530
+ "grad_norm": 0.07211136817932129,
531
+ "learning_rate": 2.2758329825344545e-05,
532
+ "loss": 0.0234,
533
+ "mean_token_accuracy": 0.9924190586805344,
534
+ "num_tokens": 7856618.0,
535
+ "step": 1375
536
+ },
537
+ {
538
+ "epoch": 3.3741701870850935,
539
+ "grad_norm": 0.0694340318441391,
540
+ "learning_rate": 1.9036310676523382e-05,
541
+ "loss": 0.0258,
542
+ "mean_token_accuracy": 0.9920255327224732,
543
+ "num_tokens": 7992886.0,
544
+ "step": 1400
545
+ },
546
+ {
547
+ "epoch": 3.4345202172601086,
548
+ "grad_norm": 0.05492745339870453,
549
+ "learning_rate": 1.5625888181008965e-05,
550
+ "loss": 0.0233,
551
+ "mean_token_accuracy": 0.9923601657152176,
552
+ "num_tokens": 8142240.0,
553
+ "step": 1425
554
+ },
555
+ {
556
+ "epoch": 3.4948702474351236,
557
+ "grad_norm": 0.10188309103250504,
558
+ "learning_rate": 1.2535176620259418e-05,
559
+ "loss": 0.0274,
560
+ "mean_token_accuracy": 0.9914963799715042,
561
+ "num_tokens": 8280355.0,
562
+ "step": 1450
563
+ },
564
+ {
565
+ "epoch": 3.5552202776101387,
566
+ "grad_norm": 0.06785538047552109,
567
+ "learning_rate": 9.771529600328754e-06,
568
+ "loss": 0.0251,
569
+ "mean_token_accuracy": 0.991920046210289,
570
+ "num_tokens": 8429816.0,
571
+ "step": 1475
572
+ },
573
+ {
574
+ "epoch": 3.6155703077851538,
575
+ "grad_norm": 0.08000839501619339,
576
+ "learning_rate": 7.34152255572697e-06,
577
+ "loss": 0.0258,
578
+ "mean_token_accuracy": 0.9917354655265808,
579
+ "num_tokens": 8567287.0,
580
+ "step": 1500
581
+ },
582
+ {
583
+ "epoch": 3.675920337960169,
584
+ "grad_norm": 0.06494925171136856,
585
+ "learning_rate": 5.250937104752384e-06,
586
+ "loss": 0.0227,
587
+ "mean_token_accuracy": 0.9926939576864242,
588
+ "num_tokens": 8717200.0,
589
+ "step": 1525
590
+ },
591
+ {
592
+ "epoch": 3.736270368135184,
593
+ "grad_norm": 0.06809823215007782,
594
+ "learning_rate": 3.5047472935191723e-06,
595
+ "loss": 0.0267,
596
+ "mean_token_accuracy": 0.9914516353607178,
597
+ "num_tokens": 8854512.0,
598
+ "step": 1550
599
+ },
600
+ {
601
+ "epoch": 3.796620398310199,
602
+ "grad_norm": 0.06002349779009819,
603
+ "learning_rate": 2.1071077614088605e-06,
604
+ "loss": 0.0248,
605
+ "mean_token_accuracy": 0.9922404575347901,
606
+ "num_tokens": 9002707.0,
607
+ "step": 1575
608
+ },
609
+ {
610
+ "epoch": 3.856970428485214,
611
+ "grad_norm": 0.07697267085313797,
612
+ "learning_rate": 1.0613438561036302e-06,
613
+ "loss": 0.0259,
614
+ "mean_token_accuracy": 0.9919641929864883,
615
+ "num_tokens": 9138803.0,
616
+ "step": 1600
617
+ },
618
+ {
619
+ "epoch": 3.9173204586602295,
620
+ "grad_norm": 0.056576263159513474,
621
+ "learning_rate": 3.6994372171977317e-07,
622
+ "loss": 0.0229,
623
+ "mean_token_accuracy": 0.992724329829216,
624
+ "num_tokens": 9288237.0,
625
+ "step": 1625
626
+ },
627
+ {
628
+ "epoch": 3.9776704888352445,
629
+ "grad_norm": 0.07688049972057343,
630
+ "learning_rate": 3.455237886632045e-08,
631
+ "loss": 0.0234,
632
+ "mean_token_accuracy": 0.992882153391838,
633
+ "num_tokens": 9421425.0,
634
+ "step": 1650
635
+ },
636
+ {
637
+ "epoch": 4.0,
638
+ "eval_loss": 0.04342251271009445,
639
+ "eval_mean_token_accuracy": 0.988437723791277,
640
+ "eval_num_tokens": 9469132.0,
641
+ "eval_runtime": 72.5212,
642
+ "eval_samples_per_second": 5.088,
643
+ "eval_steps_per_second": 2.551,
644
+ "step": 1660
645
+ }
646
+ ],
647
+ "logging_steps": 25,
648
+ "max_steps": 1660,
649
+ "num_input_tokens_seen": 0,
650
+ "num_train_epochs": 4,
651
+ "save_steps": 500,
652
+ "stateful_callbacks": {
653
+ "TrainerControl": {
654
+ "args": {
655
+ "should_epoch_stop": false,
656
+ "should_evaluate": false,
657
+ "should_log": false,
658
+ "should_save": true,
659
+ "should_training_stop": true
660
+ },
661
+ "attributes": {}
662
+ }
663
+ },
664
+ "total_flos": 4.354643154101023e+17,
665
+ "train_batch_size": 2,
666
+ "trial_name": null,
667
+ "trial_params": null
668
+ }
checkpoint-1660/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa67ef66fca6209c5a81244c077133db48814d04ffb68d47dcf6047e1890fc8f
3
+ size 6033
checkpoint-1660/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-415/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-8B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2
checkpoint-415/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-8B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "k_proj",
29
+ "v_proj",
30
+ "gate_proj",
31
+ "o_proj",
32
+ "down_proj",
33
+ "up_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
checkpoint-415/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44999bc24686256a3ea4c97089d58ddd2327c87b1ea499cf33cb68c00a8f5a95
3
+ size 349243752
checkpoint-415/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-415/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
checkpoint-415/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-415/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a138f7115f2f059fcb953fd24c1f5dd3a8879bcd1888768b28d78a442df8aa2
3
+ size 698662547
checkpoint-415/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89a0d2294425ffc300740dd935430fffd4cd109cb2876b1b773f8cb61d0a6153
3
+ size 14645
checkpoint-415/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b01c172f8ab5b63a14eea628ee0fae82d2bb80e4fd533a8eab533314c8ec1612
3
+ size 1465
checkpoint-415/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
checkpoint-415/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69fae70b4e2890472c74ae51adff7a0f50c32b6bfbea38cd97da67fea79a12bb
3
+ size 11422819
checkpoint-415/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|im_end|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
checkpoint-415/trainer_state.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 415,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.060350030175015085,
14
+ "grad_norm": 0.3188655376434326,
15
+ "learning_rate": 0.00014399999999999998,
16
+ "loss": 1.86,
17
+ "mean_token_accuracy": 0.6245196205377579,
18
+ "num_tokens": 157541.0,
19
+ "step": 25
20
+ },
21
+ {
22
+ "epoch": 0.12070006035003017,
23
+ "grad_norm": 0.2658841907978058,
24
+ "learning_rate": 0.000294,
25
+ "loss": 0.8983,
26
+ "mean_token_accuracy": 0.7745399290323257,
27
+ "num_tokens": 284307.0,
28
+ "step": 50
29
+ },
30
+ {
31
+ "epoch": 0.18105009052504525,
32
+ "grad_norm": 0.24192510545253754,
33
+ "learning_rate": 0.00029983554299928354,
34
+ "loss": 0.6083,
35
+ "mean_token_accuracy": 0.8323455977439881,
36
+ "num_tokens": 440617.0,
37
+ "step": 75
38
+ },
39
+ {
40
+ "epoch": 0.24140012070006034,
41
+ "grad_norm": 0.3552621006965637,
42
+ "learning_rate": 0.00029931487386844626,
43
+ "loss": 0.5333,
44
+ "mean_token_accuracy": 0.8501472049951553,
45
+ "num_tokens": 567051.0,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.30175015087507545,
50
+ "grad_norm": 0.29719677567481995,
51
+ "learning_rate": 0.000298438945846945,
52
+ "loss": 0.4156,
53
+ "mean_token_accuracy": 0.8798932474851608,
54
+ "num_tokens": 724100.0,
55
+ "step": 125
56
+ },
57
+ {
58
+ "epoch": 0.3621001810500905,
59
+ "grad_norm": 0.30843958258628845,
60
+ "learning_rate": 0.0002972098429951895,
61
+ "loss": 0.3274,
62
+ "mean_token_accuracy": 0.9020548111200333,
63
+ "num_tokens": 852073.0,
64
+ "step": 150
65
+ },
66
+ {
67
+ "epoch": 0.4224502112251056,
68
+ "grad_norm": 0.3006002604961395,
69
+ "learning_rate": 0.0002956304896682979,
70
+ "loss": 0.2719,
71
+ "mean_token_accuracy": 0.9195013505220413,
72
+ "num_tokens": 1012131.0,
73
+ "step": 175
74
+ },
75
+ {
76
+ "epoch": 0.4828002414001207,
77
+ "grad_norm": 0.39749929308891296,
78
+ "learning_rate": 0.00029370464355829616,
79
+ "loss": 0.2304,
80
+ "mean_token_accuracy": 0.9330711585283279,
81
+ "num_tokens": 1138475.0,
82
+ "step": 200
83
+ },
84
+ {
85
+ "epoch": 0.5431502715751357,
86
+ "grad_norm": 0.21084098517894745,
87
+ "learning_rate": 0.00029143688675359184,
88
+ "loss": 0.1776,
89
+ "mean_token_accuracy": 0.9493161207437515,
90
+ "num_tokens": 1298548.0,
91
+ "step": 225
92
+ },
93
+ {
94
+ "epoch": 0.6035003017501509,
95
+ "grad_norm": 0.2666475772857666,
96
+ "learning_rate": 0.000288832614836995,
97
+ "loss": 0.1483,
98
+ "mean_token_accuracy": 0.9569978493452073,
99
+ "num_tokens": 1428047.0,
100
+ "step": 250
101
+ },
102
+ {
103
+ "epoch": 0.663850331925166,
104
+ "grad_norm": 0.291533887386322,
105
+ "learning_rate": 0.00028589802404822455,
106
+ "loss": 0.1563,
107
+ "mean_token_accuracy": 0.9560109853744507,
108
+ "num_tokens": 1586758.0,
109
+ "step": 275
110
+ },
111
+ {
112
+ "epoch": 0.724200362100181,
113
+ "grad_norm": 0.3723059892654419,
114
+ "learning_rate": 0.0002826400965414433,
115
+ "loss": 0.1303,
116
+ "mean_token_accuracy": 0.9621474850177765,
117
+ "num_tokens": 1714018.0,
118
+ "step": 300
119
+ },
120
+ {
121
+ "epoch": 0.7845503922751962,
122
+ "grad_norm": 0.39150121808052063,
123
+ "learning_rate": 0.00027906658377289907,
124
+ "loss": 0.115,
125
+ "mean_token_accuracy": 0.9675602000951767,
126
+ "num_tokens": 1872256.0,
127
+ "step": 325
128
+ },
129
+ {
130
+ "epoch": 0.8449004224502112,
131
+ "grad_norm": 0.3891217112541199,
132
+ "learning_rate": 0.0002751859880581954,
133
+ "loss": 0.1052,
134
+ "mean_token_accuracy": 0.9699741625785827,
135
+ "num_tokens": 1999407.0,
136
+ "step": 350
137
+ },
138
+ {
139
+ "epoch": 0.9052504526252263,
140
+ "grad_norm": 0.15646834671497345,
141
+ "learning_rate": 0.00027100754234307293,
142
+ "loss": 0.095,
143
+ "mean_token_accuracy": 0.9727736663818359,
144
+ "num_tokens": 2159407.0,
145
+ "step": 375
146
+ },
147
+ {
148
+ "epoch": 0.9656004828002414,
149
+ "grad_norm": 0.3074830174446106,
150
+ "learning_rate": 0.00026654118823583243,
151
+ "loss": 0.1027,
152
+ "mean_token_accuracy": 0.9708205509185791,
153
+ "num_tokens": 2287233.0,
154
+ "step": 400
155
+ },
156
+ {
157
+ "epoch": 1.0,
158
+ "eval_loss": 0.08644451200962067,
159
+ "eval_mean_token_accuracy": 0.9759664358319463,
160
+ "eval_num_tokens": 2367283.0,
161
+ "eval_runtime": 72.5321,
162
+ "eval_samples_per_second": 5.087,
163
+ "eval_steps_per_second": 2.551,
164
+ "step": 415
165
+ }
166
+ ],
167
+ "logging_steps": 25,
168
+ "max_steps": 1660,
169
+ "num_input_tokens_seen": 0,
170
+ "num_train_epochs": 4,
171
+ "save_steps": 500,
172
+ "stateful_callbacks": {
173
+ "TrainerControl": {
174
+ "args": {
175
+ "should_epoch_stop": false,
176
+ "should_evaluate": false,
177
+ "should_log": false,
178
+ "should_save": true,
179
+ "should_training_stop": false
180
+ },
181
+ "attributes": {}
182
+ }
183
+ },
184
+ "total_flos": 1.0886023372599091e+17,
185
+ "train_batch_size": 2,
186
+ "trial_name": null,
187
+ "trial_params": null
188
+ }
checkpoint-415/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa67ef66fca6209c5a81244c077133db48814d04ffb68d47dcf6047e1890fc8f
3
+ size 6033
checkpoint-415/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-830/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-8B
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2
checkpoint-830/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-8B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "k_proj",
29
+ "v_proj",
30
+ "gate_proj",
31
+ "o_proj",
32
+ "down_proj",
33
+ "up_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
checkpoint-830/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a71f6d23018367f729e2f625ac3a9122f7cb0a0bbcb57b6c4b5e7cdc3e0eefb
3
+ size 349243752
checkpoint-830/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-830/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
checkpoint-830/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-830/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce3d77ae797509eac1c38d715e000a0c9e1b991f6d8b6fa26277f31346fde9aa
3
+ size 698662547
checkpoint-830/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef51f7f681b288c283f6c6ee22f1c70756b935d9e14a1eb052ba8b2bddfcfaa8
3
+ size 14645
checkpoint-830/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1820d810469d58a18885a80d0098317f425057406231d2e24bae3e0f07f43dd7
3
+ size 1465
checkpoint-830/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }