mrplants commited on
Commit
f608d03
·
verified ·
1 Parent(s): f11a57b

Training in progress, epoch 10, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -4,7 +4,7 @@
4
  "Phi4MMForCausalLM"
5
  ],
6
  "attention_bias": false,
7
- "attention_dropout": 0.0,
8
  "audio_processor": {
9
  "config": {
10
  "activation": "swish",
@@ -53,6 +53,7 @@
53
  "AutoTokenizer": "microsoft/Phi-4-multimodal-instruct--Xenova/gpt-4o"
54
  },
55
  "bos_token_id": 199999,
 
56
  "embd_layer": {
57
  "audio_embd_layer": {
58
  "compression_rate": 8,
 
4
  "Phi4MMForCausalLM"
5
  ],
6
  "attention_bias": false,
7
+ "attention_dropout": 0.1,
8
  "audio_processor": {
9
  "config": {
10
  "activation": "swish",
 
53
  "AutoTokenizer": "microsoft/Phi-4-multimodal-instruct--Xenova/gpt-4o"
54
  },
55
  "bos_token_id": 199999,
56
+ "dropout": 0.1,
57
  "embd_layer": {
58
  "audio_embd_layer": {
59
  "compression_rate": 8,
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2dba202c4bd78baf747939330a784ddb3edf466e2590e23ae264ea8c0bf8af4e
3
  size 4998420448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b938ce88ce1cffbca9406119f3bf3d6a8c7c221672c70d7f6a014ed31288208e
3
  size 4998420448
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbb84b667bd07926038796d7587907cbfcbc94dad52669f9e5e32c125b107d10
3
  size 4983891952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b3aa7d526316649da9fe097edf86c134909ce89af2c5e4be748b52708a2e5b
3
  size 4983891952
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b0f9f026114b452f2afea019b04b6f49b6c86c4f8aa8ad865e681b8a1634355
3
  size 1905111704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:115e7c916a84bb2ca309529ea182bfd9a9fc6603de3b3220122143164643d022
3
  size 1905111704
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87c6c051e11b415870596076dfafbffe58ceab60bb8ee77b20d335152338d956
3
  size 15344257558
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4503be4bed2bdc196f9aa5b48728866e8bb65dd630a8dd2e33efcd021e17858d
3
  size 15344257558
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c70a41264c08a1e8401b7173fe9901cfca41eb5cf987bd975ed722bcda9db818
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac0389b5da961b38667013030da96e0e998cdc2366307000dfb275a026d99b15
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:017002c2d629e5f1c7bb7237618315ba17edcbbd589c1b4ae48239e8f9a1d79f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:782bc8820f2e383abba755c44f38af6b90e6ed80083b23c764e330fe7a1a700b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,837 +1,796 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 9.0,
5
  "eval_steps": 500,
6
- "global_step": 2682,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08389261744966443,
13
- "grad_norm": 52.25,
14
- "learning_rate": 5.555555555555557e-06,
15
- "loss": 4.2261,
16
  "step": 25
17
  },
18
  {
19
- "epoch": 0.16778523489932887,
20
- "grad_norm": 30.625,
21
- "learning_rate": 1.1111111111111113e-05,
22
- "loss": 2.4269,
23
  "step": 50
24
  },
25
  {
26
- "epoch": 0.2516778523489933,
27
- "grad_norm": 25.5,
28
- "learning_rate": 1.6666666666666667e-05,
29
- "loss": 1.6724,
30
  "step": 75
31
  },
32
  {
33
- "epoch": 0.33557046979865773,
34
- "grad_norm": 18.0,
35
- "learning_rate": 1.9999409160138695e-05,
36
- "loss": 1.6634,
37
  "step": 100
38
  },
39
  {
40
- "epoch": 0.41946308724832215,
41
- "grad_norm": 19.375,
42
- "learning_rate": 1.9992763013493023e-05,
43
- "loss": 1.4662,
44
  "step": 125
45
  },
46
  {
47
- "epoch": 0.5033557046979866,
48
- "grad_norm": 16.25,
49
- "learning_rate": 1.9978737094995525e-05,
50
- "loss": 1.5513,
51
  "step": 150
52
  },
53
  {
54
- "epoch": 0.587248322147651,
55
- "grad_norm": 18.125,
56
- "learning_rate": 1.9957341762950346e-05,
57
- "loss": 1.4649,
58
  "step": 175
59
  },
60
  {
61
- "epoch": 0.6711409395973155,
62
- "grad_norm": 17.0,
63
- "learning_rate": 1.992859281805935e-05,
64
- "loss": 1.4342,
65
  "step": 200
66
  },
67
  {
68
- "epoch": 0.7550335570469798,
69
- "grad_norm": 15.375,
70
- "learning_rate": 1.9892511491753126e-05,
71
- "loss": 1.4756,
72
  "step": 225
73
  },
74
  {
75
- "epoch": 0.8389261744966443,
76
- "grad_norm": 19.0,
77
- "learning_rate": 1.984912443051131e-05,
78
- "loss": 1.4059,
79
  "step": 250
80
  },
81
  {
82
- "epoch": 0.9228187919463087,
83
- "grad_norm": 13.75,
84
- "learning_rate": 1.9798463676183887e-05,
85
- "loss": 1.437,
86
- "step": 275
 
87
  },
88
  {
89
- "epoch": 1.0,
90
- "eval_loss": 1.4382814168930054,
91
- "eval_runtime": 1.8726,
92
- "eval_samples_per_second": 67.287,
93
- "eval_steps_per_second": 8.544,
94
- "step": 298
95
  },
96
  {
97
- "epoch": 1.0067114093959733,
98
- "grad_norm": 11.0625,
99
- "learning_rate": 1.9740566642327868e-05,
100
- "loss": 1.3453,
101
  "step": 300
102
  },
103
  {
104
- "epoch": 1.0906040268456376,
105
- "grad_norm": 16.875,
106
- "learning_rate": 1.967547608657697e-05,
107
- "loss": 1.0968,
108
  "step": 325
109
  },
110
  {
111
- "epoch": 1.174496644295302,
112
- "grad_norm": 17.0,
113
- "learning_rate": 1.9603240079064605e-05,
114
- "loss": 1.0313,
115
  "step": 350
116
  },
117
  {
118
- "epoch": 1.2583892617449663,
119
- "grad_norm": 19.0,
120
- "learning_rate": 1.9523911966923506e-05,
121
- "loss": 1.1095,
122
  "step": 375
123
  },
124
  {
125
- "epoch": 1.342281879194631,
126
- "grad_norm": 17.625,
127
- "learning_rate": 1.9437550334888277e-05,
128
- "loss": 1.1064,
129
  "step": 400
130
  },
131
  {
132
- "epoch": 1.4261744966442953,
133
- "grad_norm": 17.125,
134
- "learning_rate": 1.9344218962029856e-05,
135
- "loss": 1.0744,
136
  "step": 425
137
  },
138
  {
139
- "epoch": 1.5100671140939599,
140
- "grad_norm": 15.0625,
141
- "learning_rate": 1.9243986774653954e-05,
142
- "loss": 1.0505,
143
  "step": 450
144
  },
145
  {
146
- "epoch": 1.5939597315436242,
147
- "grad_norm": 16.375,
148
- "learning_rate": 1.9136927795398158e-05,
149
- "loss": 1.0781,
150
  "step": 475
151
  },
152
  {
153
- "epoch": 1.6778523489932886,
154
- "grad_norm": 10.625,
155
- "learning_rate": 1.9023121088565353e-05,
156
- "loss": 1.0273,
157
  "step": 500
158
  },
159
  {
160
- "epoch": 1.761744966442953,
161
- "grad_norm": 14.0625,
162
- "learning_rate": 1.890265070173382e-05,
163
- "loss": 1.1687,
 
 
 
 
 
 
 
 
164
  "step": 525
165
  },
166
  {
167
- "epoch": 1.8456375838926173,
168
- "grad_norm": 19.375,
169
- "learning_rate": 1.8775605603687128e-05,
170
- "loss": 1.0875,
171
  "step": 550
172
  },
173
  {
174
- "epoch": 1.929530201342282,
175
- "grad_norm": 18.75,
176
- "learning_rate": 1.8642079618709627e-05,
177
- "loss": 1.0308,
178
  "step": 575
179
  },
180
  {
181
- "epoch": 2.0,
182
- "eval_loss": 1.3985741138458252,
183
- "eval_runtime": 1.8989,
184
- "eval_samples_per_second": 66.353,
185
- "eval_steps_per_second": 8.426,
186
- "step": 596
187
- },
188
- {
189
- "epoch": 2.0134228187919465,
190
- "grad_norm": 14.125,
191
- "learning_rate": 1.8502171357296144e-05,
192
- "loss": 1.0441,
193
  "step": 600
194
  },
195
  {
196
- "epoch": 2.097315436241611,
197
- "grad_norm": 15.5625,
198
- "learning_rate": 1.8355984143326968e-05,
199
- "loss": 0.7089,
200
  "step": 625
201
  },
202
  {
203
- "epoch": 2.1812080536912752,
204
- "grad_norm": 17.5,
205
- "learning_rate": 1.820362593776198e-05,
206
- "loss": 0.6969,
207
  "step": 650
208
  },
209
  {
210
- "epoch": 2.2651006711409396,
211
- "grad_norm": 17.125,
212
- "learning_rate": 1.804520925891021e-05,
213
- "loss": 0.7308,
214
  "step": 675
215
  },
216
  {
217
- "epoch": 2.348993288590604,
218
- "grad_norm": 19.5,
219
- "learning_rate": 1.7880851099333762e-05,
220
- "loss": 0.7553,
221
  "step": 700
222
  },
223
  {
224
- "epoch": 2.4328859060402683,
225
- "grad_norm": 18.375,
226
- "learning_rate": 1.7710672839447442e-05,
227
- "loss": 0.6881,
228
  "step": 725
229
  },
230
  {
231
- "epoch": 2.5167785234899327,
232
- "grad_norm": 16.375,
233
- "learning_rate": 1.753480015787792e-05,
234
- "loss": 0.744,
235
  "step": 750
236
  },
237
  {
238
- "epoch": 2.600671140939597,
239
- "grad_norm": 23.375,
240
- "learning_rate": 1.735336293864857e-05,
241
- "loss": 0.751,
 
 
 
 
 
 
 
 
242
  "step": 775
243
  },
244
  {
245
- "epoch": 2.684563758389262,
246
- "grad_norm": 15.5,
247
- "learning_rate": 1.7166495175258654e-05,
248
- "loss": 0.6958,
249
  "step": 800
250
  },
251
  {
252
- "epoch": 2.7684563758389262,
253
- "grad_norm": 25.25,
254
- "learning_rate": 1.697433487172752e-05,
255
- "loss": 0.7711,
256
  "step": 825
257
  },
258
  {
259
- "epoch": 2.8523489932885906,
260
- "grad_norm": 13.0625,
261
- "learning_rate": 1.6777023940677036e-05,
262
- "loss": 0.7639,
263
  "step": 850
264
  },
265
  {
266
- "epoch": 2.936241610738255,
267
- "grad_norm": 16.25,
268
- "learning_rate": 1.657470809852749e-05,
269
- "loss": 0.8271,
270
  "step": 875
271
  },
272
  {
273
- "epoch": 3.0,
274
- "eval_loss": 1.5429246425628662,
275
- "eval_runtime": 1.949,
276
- "eval_samples_per_second": 64.648,
277
- "eval_steps_per_second": 8.209,
278
- "step": 894
279
- },
280
- {
281
- "epoch": 3.0201342281879193,
282
- "grad_norm": 15.5,
283
- "learning_rate": 1.6367536757884285e-05,
284
- "loss": 0.6906,
285
  "step": 900
286
  },
287
  {
288
- "epoch": 3.1040268456375837,
289
- "grad_norm": 21.75,
290
- "learning_rate": 1.615566291719502e-05,
291
- "loss": 0.4225,
292
  "step": 925
293
  },
294
  {
295
- "epoch": 3.1879194630872485,
296
- "grad_norm": 18.0,
297
- "learning_rate": 1.5939243047758312e-05,
298
- "loss": 0.4291,
299
  "step": 950
300
  },
301
  {
302
- "epoch": 3.271812080536913,
303
- "grad_norm": 26.75,
304
- "learning_rate": 1.5718436978167976e-05,
305
- "loss": 0.4474,
306
  "step": 975
307
  },
308
  {
309
- "epoch": 3.3557046979865772,
310
- "grad_norm": 20.25,
311
- "learning_rate": 1.54934077762777e-05,
312
- "loss": 0.4227,
313
  "step": 1000
314
  },
315
  {
316
- "epoch": 3.4395973154362416,
317
- "grad_norm": 18.5,
318
- "learning_rate": 1.526432162877356e-05,
319
- "loss": 0.4062,
 
 
 
 
 
 
 
 
320
  "step": 1025
321
  },
322
  {
323
- "epoch": 3.523489932885906,
324
- "grad_norm": 13.3125,
325
- "learning_rate": 1.5031347718443212e-05,
326
- "loss": 0.4282,
327
  "step": 1050
328
  },
329
  {
330
- "epoch": 3.6073825503355703,
331
- "grad_norm": 12.875,
332
- "learning_rate": 1.4794658099232426e-05,
333
- "loss": 0.451,
334
  "step": 1075
335
  },
336
  {
337
- "epoch": 3.6912751677852347,
338
- "grad_norm": 18.0,
339
- "learning_rate": 1.455442756918126e-05,
340
- "loss": 0.407,
341
  "step": 1100
342
  },
343
  {
344
- "epoch": 3.7751677852348995,
345
- "grad_norm": 22.375,
346
- "learning_rate": 1.4310833541333658e-05,
347
- "loss": 0.4382,
348
  "step": 1125
349
  },
350
  {
351
- "epoch": 3.859060402684564,
352
- "grad_norm": 23.0,
353
- "learning_rate": 1.4064055912715846e-05,
354
- "loss": 0.4321,
355
  "step": 1150
356
  },
357
  {
358
- "epoch": 3.942953020134228,
359
- "grad_norm": 18.375,
360
- "learning_rate": 1.3814276931480308e-05,
361
- "loss": 0.4301,
362
  "step": 1175
363
  },
364
  {
365
- "epoch": 4.0,
366
- "eval_loss": 1.8369454145431519,
367
- "eval_runtime": 1.9833,
368
- "eval_samples_per_second": 63.53,
369
- "eval_steps_per_second": 8.067,
370
- "step": 1192
371
- },
372
- {
373
- "epoch": 4.026845637583893,
374
- "grad_norm": 25.25,
375
- "learning_rate": 1.356168106231337e-05,
376
- "loss": 0.3562,
377
  "step": 1200
378
  },
379
  {
380
- "epoch": 4.110738255033557,
381
- "grad_norm": 18.625,
382
- "learning_rate": 1.3306454850205914e-05,
383
- "loss": 0.2158,
384
  "step": 1225
385
  },
386
  {
387
- "epoch": 4.194630872483222,
388
- "grad_norm": 23.875,
389
- "learning_rate": 1.3048786782687706e-05,
390
- "loss": 0.1972,
391
  "step": 1250
392
  },
393
  {
394
- "epoch": 4.278523489932886,
395
- "grad_norm": 17.0,
396
- "learning_rate": 1.2788867150627163e-05,
397
- "loss": 0.1911,
 
 
 
 
 
 
 
 
398
  "step": 1275
399
  },
400
  {
401
- "epoch": 4.3624161073825505,
402
- "grad_norm": 13.6875,
403
- "learning_rate": 1.2526887907699349e-05,
404
- "loss": 0.2341,
405
  "step": 1300
406
  },
407
  {
408
- "epoch": 4.446308724832215,
409
- "grad_norm": 21.5,
410
- "learning_rate": 1.2263042528625928e-05,
411
- "loss": 0.1984,
412
  "step": 1325
413
  },
414
  {
415
- "epoch": 4.530201342281879,
416
- "grad_norm": 24.25,
417
- "learning_rate": 1.1997525866291842e-05,
418
- "loss": 0.1688,
419
  "step": 1350
420
  },
421
  {
422
- "epoch": 4.614093959731544,
423
- "grad_norm": 23.375,
424
- "learning_rate": 1.1730534007844186e-05,
425
- "loss": 0.1869,
426
  "step": 1375
427
  },
428
  {
429
- "epoch": 4.697986577181208,
430
- "grad_norm": 16.75,
431
- "learning_rate": 1.1462264129879555e-05,
432
- "loss": 0.2089,
433
  "step": 1400
434
  },
435
  {
436
- "epoch": 4.781879194630872,
437
- "grad_norm": 20.25,
438
- "learning_rate": 1.1192914352826849e-05,
439
- "loss": 0.1923,
440
  "step": 1425
441
  },
442
  {
443
- "epoch": 4.865771812080537,
444
- "grad_norm": 46.25,
445
- "learning_rate": 1.092268359463302e-05,
446
- "loss": 0.1786,
447
  "step": 1450
448
  },
449
  {
450
- "epoch": 4.949664429530201,
451
- "grad_norm": 19.25,
452
- "learning_rate": 1.0651771423859845e-05,
453
- "loss": 0.2009,
454
  "step": 1475
455
  },
456
  {
457
- "epoch": 5.0,
458
- "eval_loss": 2.166541337966919,
459
- "eval_runtime": 1.83,
460
- "eval_samples_per_second": 68.853,
461
- "eval_steps_per_second": 8.743,
462
- "step": 1490
463
  },
464
  {
465
- "epoch": 5.033557046979865,
466
- "grad_norm": 13.4375,
467
- "learning_rate": 1.0380377912300231e-05,
468
- "loss": 0.158,
469
- "step": 1500
 
470
  },
471
  {
472
- "epoch": 5.117449664429531,
473
- "grad_norm": 6.5,
474
- "learning_rate": 1.0108703487222855e-05,
475
- "loss": 0.0834,
476
  "step": 1525
477
  },
478
  {
479
- "epoch": 5.201342281879195,
480
- "grad_norm": 14.9375,
481
- "learning_rate": 9.836948783354308e-06,
482
- "loss": 0.0662,
483
  "step": 1550
484
  },
485
  {
486
- "epoch": 5.285234899328859,
487
- "grad_norm": 11.125,
488
- "learning_rate": 9.565314494707995e-06,
489
- "loss": 0.0883,
490
  "step": 1575
491
  },
492
  {
493
- "epoch": 5.369127516778524,
494
- "grad_norm": 15.375,
495
- "learning_rate": 9.294001226369281e-06,
496
- "loss": 0.0734,
497
  "step": 1600
498
  },
499
  {
500
- "epoch": 5.453020134228188,
501
- "grad_norm": 8.75,
502
- "learning_rate": 9.023209346346293e-06,
503
- "loss": 0.079,
504
  "step": 1625
505
  },
506
  {
507
- "epoch": 5.5369127516778525,
508
- "grad_norm": 35.25,
509
- "learning_rate": 8.753138837595818e-06,
510
- "loss": 0.0793,
511
  "step": 1650
512
  },
513
  {
514
- "epoch": 5.620805369127517,
515
- "grad_norm": 14.0,
516
- "learning_rate": 8.483989150333556e-06,
517
- "loss": 0.0793,
518
  "step": 1675
519
  },
520
  {
521
- "epoch": 5.704697986577181,
522
- "grad_norm": 32.5,
523
- "learning_rate": 8.215959054737817e-06,
524
- "loss": 0.0683,
525
  "step": 1700
526
  },
527
  {
528
- "epoch": 5.7885906040268456,
529
- "grad_norm": 8.625,
530
- "learning_rate": 7.94924649415542e-06,
531
- "loss": 0.0788,
532
  "step": 1725
533
  },
534
  {
535
- "epoch": 5.87248322147651,
536
- "grad_norm": 7.8125,
537
- "learning_rate": 7.684048438918247e-06,
538
- "loss": 0.0699,
539
  "step": 1750
540
  },
541
  {
542
- "epoch": 5.956375838926174,
543
- "grad_norm": 7.625,
544
- "learning_rate": 7.420560740878335e-06,
545
- "loss": 0.0988,
546
- "step": 1775
 
547
  },
548
  {
549
- "epoch": 6.0,
550
- "eval_loss": 2.644843578338623,
551
- "eval_runtime": 1.8563,
552
- "eval_samples_per_second": 67.878,
553
- "eval_steps_per_second": 8.619,
554
- "step": 1788
555
  },
556
  {
557
- "epoch": 6.040268456375839,
558
- "grad_norm": 5.03125,
559
- "learning_rate": 7.1589779887690235e-06,
560
- "loss": 0.0675,
561
  "step": 1800
562
  },
563
  {
564
- "epoch": 6.124161073825503,
565
- "grad_norm": 5.9375,
566
- "learning_rate": 6.899493364498884e-06,
567
- "loss": 0.0397,
568
  "step": 1825
569
  },
570
  {
571
- "epoch": 6.208053691275167,
572
- "grad_norm": 11.5625,
573
- "learning_rate": 6.642298500484657e-06,
574
- "loss": 0.0314,
575
  "step": 1850
576
  },
577
  {
578
- "epoch": 6.291946308724833,
579
- "grad_norm": 5.21875,
580
- "learning_rate": 6.387583338128471e-06,
581
- "loss": 0.0401,
582
  "step": 1875
583
  },
584
  {
585
- "epoch": 6.375838926174497,
586
- "grad_norm": 7.5,
587
- "learning_rate": 6.1355359875438995e-06,
588
- "loss": 0.0286,
589
  "step": 1900
590
  },
591
  {
592
- "epoch": 6.459731543624161,
593
- "grad_norm": 8.0,
594
- "learning_rate": 5.886342588634458e-06,
595
- "loss": 0.0374,
596
  "step": 1925
597
  },
598
  {
599
- "epoch": 6.543624161073826,
600
- "grad_norm": 24.5,
601
- "learning_rate": 5.64018717362711e-06,
602
- "loss": 0.0404,
603
  "step": 1950
604
  },
605
  {
606
- "epoch": 6.62751677852349,
607
- "grad_norm": 16.75,
608
- "learning_rate": 5.397251531162332e-06,
609
- "loss": 0.0256,
610
  "step": 1975
611
  },
612
  {
613
- "epoch": 6.7114093959731544,
614
- "grad_norm": 6.0,
615
- "learning_rate": 5.157715072041094e-06,
616
- "loss": 0.037,
617
  "step": 2000
618
  },
619
  {
620
- "epoch": 6.795302013422819,
621
- "grad_norm": 5.9375,
622
- "learning_rate": 4.92175469672787e-06,
623
- "loss": 0.0365,
 
 
 
 
 
 
 
 
624
  "step": 2025
625
  },
626
  {
627
- "epoch": 6.879194630872483,
628
- "grad_norm": 6.28125,
629
- "learning_rate": 4.6895446647076e-06,
630
- "loss": 0.0288,
631
  "step": 2050
632
  },
633
  {
634
- "epoch": 6.9630872483221475,
635
- "grad_norm": 19.375,
636
- "learning_rate": 4.461256465793033e-06,
637
- "loss": 0.0324,
638
  "step": 2075
639
  },
640
  {
641
- "epoch": 7.0,
642
- "eval_loss": 2.95961856842041,
643
- "eval_runtime": 1.9194,
644
- "eval_samples_per_second": 65.646,
645
- "eval_steps_per_second": 8.336,
646
- "step": 2086
647
- },
648
- {
649
- "epoch": 7.046979865771812,
650
- "grad_norm": 2.34375,
651
- "learning_rate": 4.237058693477499e-06,
652
- "loss": 0.0205,
653
  "step": 2100
654
  },
655
  {
656
- "epoch": 7.130872483221476,
657
- "grad_norm": 2.140625,
658
- "learning_rate": 4.017116920426652e-06,
659
- "loss": 0.018,
660
  "step": 2125
661
  },
662
  {
663
- "epoch": 7.214765100671141,
664
- "grad_norm": 3.203125,
665
- "learning_rate": 3.801593576201118e-06,
666
- "loss": 0.022,
667
  "step": 2150
668
  },
669
  {
670
- "epoch": 7.298657718120805,
671
- "grad_norm": 7.28125,
672
- "learning_rate": 3.5906478273004053e-06,
673
- "loss": 0.0222,
674
  "step": 2175
675
  },
676
  {
677
- "epoch": 7.382550335570469,
678
- "grad_norm": 3.375,
679
- "learning_rate": 3.3844354596165364e-06,
680
- "loss": 0.0203,
681
  "step": 2200
682
  },
683
  {
684
- "epoch": 7.466442953020135,
685
- "grad_norm": 7.28125,
686
- "learning_rate": 3.183108763384415e-06,
687
- "loss": 0.0183,
688
  "step": 2225
689
  },
690
  {
691
- "epoch": 7.550335570469799,
692
- "grad_norm": 6.8125,
693
- "learning_rate": 2.986816420713662e-06,
694
- "loss": 0.0174,
695
  "step": 2250
696
  },
697
  {
698
- "epoch": 7.634228187919463,
699
- "grad_norm": 3.34375,
700
- "learning_rate": 2.795703395785184e-06,
701
- "loss": 0.0178,
 
 
 
 
 
 
 
 
702
  "step": 2275
703
  },
704
  {
705
- "epoch": 7.718120805369128,
706
- "grad_norm": 3.53125,
707
- "learning_rate": 2.6099108277934105e-06,
708
- "loss": 0.0186,
709
  "step": 2300
710
  },
711
  {
712
- "epoch": 7.802013422818792,
713
- "grad_norm": 5.28125,
714
- "learning_rate": 2.42957592671337e-06,
715
- "loss": 0.0204,
716
  "step": 2325
717
  },
718
  {
719
- "epoch": 7.885906040268456,
720
- "grad_norm": 2.625,
721
- "learning_rate": 2.2548318719695182e-06,
722
- "loss": 0.0195,
723
  "step": 2350
724
  },
725
  {
726
- "epoch": 7.969798657718121,
727
- "grad_norm": 3.640625,
728
- "learning_rate": 2.085807714081195e-06,
729
- "loss": 0.0182,
730
  "step": 2375
731
  },
732
  {
733
- "epoch": 8.0,
734
- "eval_loss": 3.16278076171875,
735
- "eval_runtime": 1.9451,
736
- "eval_samples_per_second": 64.779,
737
- "eval_steps_per_second": 8.226,
738
- "step": 2384
739
- },
740
- {
741
- "epoch": 8.053691275167786,
742
- "grad_norm": 1.9296875,
743
- "learning_rate": 1.9226282793572927e-06,
744
- "loss": 0.0132,
745
  "step": 2400
746
  },
747
  {
748
- "epoch": 8.13758389261745,
749
- "grad_norm": 1.28125,
750
- "learning_rate": 1.7654140777105954e-06,
751
- "loss": 0.0139,
752
  "step": 2425
753
  },
754
  {
755
- "epoch": 8.221476510067115,
756
- "grad_norm": 1.640625,
757
- "learning_rate": 1.6142812136597852e-06,
758
- "loss": 0.0125,
759
  "step": 2450
760
  },
761
  {
762
- "epoch": 8.305369127516778,
763
- "grad_norm": 1.9375,
764
- "learning_rate": 1.4693413005849143e-06,
765
- "loss": 0.0198,
766
  "step": 2475
767
  },
768
  {
769
- "epoch": 8.389261744966444,
770
- "grad_norm": 2.28125,
771
- "learning_rate": 1.3307013782996237e-06,
772
- "loss": 0.0158,
773
  "step": 2500
774
  },
775
  {
776
- "epoch": 8.473154362416107,
777
- "grad_norm": 4.53125,
778
- "learning_rate": 1.1984638340009935e-06,
779
- "loss": 0.0159,
780
- "step": 2525
781
- },
782
- {
783
- "epoch": 8.557046979865772,
784
- "grad_norm": 3.203125,
785
- "learning_rate": 1.0727263266554012e-06,
786
- "loss": 0.0189,
787
- "step": 2550
788
- },
789
- {
790
- "epoch": 8.640939597315436,
791
- "grad_norm": 4.28125,
792
- "learning_rate": 9.535817148762461e-07,
793
- "loss": 0.0157,
794
- "step": 2575
795
- },
796
- {
797
- "epoch": 8.724832214765101,
798
- "grad_norm": 2.625,
799
- "learning_rate": 8.411179883467668e-07,
800
- "loss": 0.0115,
801
- "step": 2600
802
- },
803
- {
804
- "epoch": 8.808724832214764,
805
- "grad_norm": 2.3125,
806
- "learning_rate": 7.354182028386591e-07,
807
- "loss": 0.0162,
808
- "step": 2625
809
- },
810
- {
811
- "epoch": 8.89261744966443,
812
- "grad_norm": 2.8125,
813
- "learning_rate": 6.365604188743979e-07,
814
- "loss": 0.0178,
815
- "step": 2650
816
- },
817
- {
818
- "epoch": 8.976510067114093,
819
- "grad_norm": 2.4375,
820
- "learning_rate": 5.446176440786488e-07,
821
- "loss": 0.0139,
822
- "step": 2675
823
- },
824
- {
825
- "epoch": 9.0,
826
- "eval_loss": 3.2047908306121826,
827
- "eval_runtime": 1.8248,
828
- "eval_samples_per_second": 69.048,
829
- "eval_steps_per_second": 8.768,
830
- "step": 2682
831
  }
832
  ],
833
  "logging_steps": 25,
834
- "max_steps": 2980,
835
  "num_input_tokens_seen": 0,
836
  "num_train_epochs": 10,
837
  "save_steps": 500,
@@ -842,12 +801,12 @@
842
  "should_evaluate": false,
843
  "should_log": false,
844
  "should_save": true,
845
- "should_training_stop": false
846
  },
847
  "attributes": {}
848
  }
849
  },
850
- "total_flos": 1.0370740667256422e+17,
851
  "train_batch_size": 8,
852
  "trial_name": null,
853
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.580127239227295,
3
+ "best_model_checkpoint": "/home/azureuser/models/grpo/checkpoint-2510",
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 2510,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.099601593625498,
13
+ "grad_norm": 240.0,
14
+ "learning_rate": 1.99203187250996e-06,
15
+ "loss": 17.7499,
16
  "step": 25
17
  },
18
  {
19
+ "epoch": 0.199203187250996,
20
+ "grad_norm": 162.0,
21
+ "learning_rate": 3.98406374501992e-06,
22
+ "loss": 15.9447,
23
  "step": 50
24
  },
25
  {
26
+ "epoch": 0.29880478087649404,
27
+ "grad_norm": 68.5,
28
+ "learning_rate": 5.976095617529881e-06,
29
+ "loss": 10.3904,
30
  "step": 75
31
  },
32
  {
33
+ "epoch": 0.398406374501992,
34
+ "grad_norm": 91.5,
35
+ "learning_rate": 7.96812749003984e-06,
36
+ "loss": 6.0541,
37
  "step": 100
38
  },
39
  {
40
+ "epoch": 0.49800796812749004,
41
+ "grad_norm": 112.0,
42
+ "learning_rate": 9.960159362549801e-06,
43
+ "loss": 3.2563,
44
  "step": 125
45
  },
46
  {
47
+ "epoch": 0.5976095617529881,
48
+ "grad_norm": 48.25,
49
+ "learning_rate": 1.1952191235059762e-05,
50
+ "loss": 2.3041,
51
  "step": 150
52
  },
53
  {
54
+ "epoch": 0.6972111553784861,
55
+ "grad_norm": 15.625,
56
+ "learning_rate": 1.3944223107569724e-05,
57
+ "loss": 2.0339,
58
  "step": 175
59
  },
60
  {
61
+ "epoch": 0.796812749003984,
62
+ "grad_norm": 12.0625,
63
+ "learning_rate": 1.593625498007968e-05,
64
+ "loss": 1.8795,
65
  "step": 200
66
  },
67
  {
68
+ "epoch": 0.896414342629482,
69
+ "grad_norm": 9.8125,
70
+ "learning_rate": 1.7928286852589643e-05,
71
+ "loss": 1.7841,
72
  "step": 225
73
  },
74
  {
75
+ "epoch": 0.9960159362549801,
76
+ "grad_norm": 6.1875,
77
+ "learning_rate": 1.9920318725099602e-05,
78
+ "loss": 1.7073,
79
  "step": 250
80
  },
81
  {
82
+ "epoch": 1.0,
83
+ "eval_loss": 1.6925737857818604,
84
+ "eval_runtime": 7.1264,
85
+ "eval_samples_per_second": 70.443,
86
+ "eval_steps_per_second": 8.84,
87
+ "step": 251
88
  },
89
  {
90
+ "epoch": 1.095617529880478,
91
+ "grad_norm": 5.40625,
92
+ "learning_rate": 1.9994430458382323e-05,
93
+ "loss": 1.669,
94
+ "step": 275
 
95
  },
96
  {
97
+ "epoch": 1.1952191235059761,
98
+ "grad_norm": 2.125,
99
+ "learning_rate": 1.997679073527335e-05,
100
+ "loss": 1.6468,
101
  "step": 300
102
  },
103
  {
104
+ "epoch": 1.294820717131474,
105
+ "grad_norm": 6.5,
106
+ "learning_rate": 1.9947092480832322e-05,
107
+ "loss": 1.6281,
108
  "step": 325
109
  },
110
  {
111
+ "epoch": 1.3944223107569722,
112
+ "grad_norm": 1.3828125,
113
+ "learning_rate": 1.9905371590102157e-05,
114
+ "loss": 1.6223,
115
  "step": 350
116
  },
117
  {
118
+ "epoch": 1.4940239043824701,
119
+ "grad_norm": 1.0546875,
120
+ "learning_rate": 1.98516784893854e-05,
121
+ "loss": 1.6118,
122
  "step": 375
123
  },
124
  {
125
+ "epoch": 1.593625498007968,
126
+ "grad_norm": 3.75,
127
+ "learning_rate": 1.978607807529606e-05,
128
+ "loss": 1.6065,
129
  "step": 400
130
  },
131
  {
132
+ "epoch": 1.6932270916334662,
133
+ "grad_norm": 4.4375,
134
+ "learning_rate": 1.9708649636321745e-05,
135
+ "loss": 1.6031,
136
  "step": 425
137
  },
138
  {
139
+ "epoch": 1.792828685258964,
140
+ "grad_norm": 1.703125,
141
+ "learning_rate": 1.961948675699101e-05,
142
+ "loss": 1.6012,
143
  "step": 450
144
  },
145
  {
146
+ "epoch": 1.8924302788844622,
147
+ "grad_norm": 1.484375,
148
+ "learning_rate": 1.9518697204761604e-05,
149
+ "loss": 1.5998,
150
  "step": 475
151
  },
152
  {
153
+ "epoch": 1.9920318725099602,
154
+ "grad_norm": 0.953125,
155
+ "learning_rate": 1.9406402799766452e-05,
156
+ "loss": 1.5987,
157
  "step": 500
158
  },
159
  {
160
+ "epoch": 2.0,
161
+ "eval_loss": 1.596346139907837,
162
+ "eval_runtime": 7.2817,
163
+ "eval_samples_per_second": 68.94,
164
+ "eval_steps_per_second": 8.652,
165
+ "step": 502
166
+ },
167
+ {
168
+ "epoch": 2.091633466135458,
169
+ "grad_norm": 0.75390625,
170
+ "learning_rate": 1.928273926757472e-05,
171
+ "loss": 1.5935,
172
  "step": 525
173
  },
174
  {
175
+ "epoch": 2.191235059760956,
176
+ "grad_norm": 1.1953125,
177
+ "learning_rate": 1.914785607514599e-05,
178
+ "loss": 1.5895,
179
  "step": 550
180
  },
181
  {
182
+ "epoch": 2.2908366533864544,
183
+ "grad_norm": 0.482421875,
184
+ "learning_rate": 1.9001916250175764e-05,
185
+ "loss": 1.5891,
186
  "step": 575
187
  },
188
  {
189
+ "epoch": 2.3904382470119523,
190
+ "grad_norm": 1.140625,
191
+ "learning_rate": 1.8845096184050684e-05,
192
+ "loss": 1.5868,
 
 
 
 
 
 
 
 
193
  "step": 600
194
  },
195
  {
196
+ "epoch": 2.49003984063745,
197
+ "grad_norm": 1.7265625,
198
+ "learning_rate": 1.86775854186516e-05,
199
+ "loss": 1.587,
200
  "step": 625
201
  },
202
  {
203
+ "epoch": 2.589641434262948,
204
+ "grad_norm": 1.0390625,
205
+ "learning_rate": 1.849958641726221e-05,
206
+ "loss": 1.5856,
207
  "step": 650
208
  },
209
  {
210
+ "epoch": 2.6892430278884465,
211
+ "grad_norm": 2.875,
212
+ "learning_rate": 1.831131431986012e-05,
213
+ "loss": 1.5835,
214
  "step": 675
215
  },
216
  {
217
+ "epoch": 2.7888446215139444,
218
+ "grad_norm": 1.828125,
219
+ "learning_rate": 1.8112996683086102e-05,
220
+ "loss": 1.5826,
221
  "step": 700
222
  },
223
  {
224
+ "epoch": 2.8884462151394423,
225
+ "grad_norm": 0.57421875,
226
+ "learning_rate": 1.7904873205205886e-05,
227
+ "loss": 1.5831,
228
  "step": 725
229
  },
230
  {
231
+ "epoch": 2.9880478087649402,
232
+ "grad_norm": 0.47265625,
233
+ "learning_rate": 1.7687195436396835e-05,
234
+ "loss": 1.5826,
235
  "step": 750
236
  },
237
  {
238
+ "epoch": 3.0,
239
+ "eval_loss": 1.5846781730651855,
240
+ "eval_runtime": 7.5283,
241
+ "eval_samples_per_second": 66.682,
242
+ "eval_steps_per_second": 8.368,
243
+ "step": 753
244
+ },
245
+ {
246
+ "epoch": 3.087649402390438,
247
+ "grad_norm": 0.375,
248
+ "learning_rate": 1.7460226474709726e-05,
249
+ "loss": 1.5809,
250
  "step": 775
251
  },
252
  {
253
+ "epoch": 3.187250996015936,
254
+ "grad_norm": 1.5703125,
255
+ "learning_rate": 1.7224240648073097e-05,
256
+ "loss": 1.5803,
257
  "step": 800
258
  },
259
  {
260
+ "epoch": 3.2868525896414345,
261
+ "grad_norm": 0.85546875,
262
+ "learning_rate": 1.6979523182724514e-05,
263
+ "loss": 1.5802,
264
  "step": 825
265
  },
266
  {
267
+ "epoch": 3.3864541832669324,
268
+ "grad_norm": 0.9765625,
269
+ "learning_rate": 1.672636985846951e-05,
270
+ "loss": 1.5789,
271
  "step": 850
272
  },
273
  {
274
+ "epoch": 3.4860557768924303,
275
+ "grad_norm": 1.4609375,
276
+ "learning_rate": 1.6465086651184826e-05,
277
+ "loss": 1.5788,
278
  "step": 875
279
  },
280
  {
281
+ "epoch": 3.585657370517928,
282
+ "grad_norm": 0.60546875,
283
+ "learning_rate": 1.6195989362998137e-05,
284
+ "loss": 1.5785,
 
 
 
 
 
 
 
 
285
  "step": 900
286
  },
287
  {
288
+ "epoch": 3.685258964143426,
289
+ "grad_norm": 1.0078125,
290
+ "learning_rate": 1.591940324059117e-05,
291
+ "loss": 1.5784,
292
  "step": 925
293
  },
294
  {
295
+ "epoch": 3.7848605577689245,
296
+ "grad_norm": 0.58984375,
297
+ "learning_rate": 1.5635662582087604e-05,
298
+ "loss": 1.5779,
299
  "step": 950
300
  },
301
  {
302
+ "epoch": 3.8844621513944224,
303
+ "grad_norm": 0.455078125,
304
+ "learning_rate": 1.534511033300083e-05,
305
+ "loss": 1.5783,
306
  "step": 975
307
  },
308
  {
309
+ "epoch": 3.9840637450199203,
310
+ "grad_norm": 0.3359375,
311
+ "learning_rate": 1.5048097671730015e-05,
312
+ "loss": 1.5769,
313
  "step": 1000
314
  },
315
  {
316
+ "epoch": 4.0,
317
+ "eval_loss": 1.581375002861023,
318
+ "eval_runtime": 7.1884,
319
+ "eval_samples_per_second": 69.834,
320
+ "eval_steps_per_second": 8.764,
321
+ "step": 1004
322
+ },
323
+ {
324
+ "epoch": 4.083665338645418,
325
+ "grad_norm": 0.365234375,
326
+ "learning_rate": 1.4744983585105388e-05,
327
+ "loss": 1.576,
328
  "step": 1025
329
  },
330
  {
331
+ "epoch": 4.183266932270916,
332
+ "grad_norm": 0.57421875,
333
+ "learning_rate": 1.4436134434495825e-05,
334
+ "loss": 1.5763,
335
  "step": 1050
336
  },
337
  {
338
+ "epoch": 4.282868525896414,
339
+ "grad_norm": 0.93359375,
340
+ "learning_rate": 1.412192351300312e-05,
341
+ "loss": 1.5763,
342
  "step": 1075
343
  },
344
  {
345
+ "epoch": 4.382470119521912,
346
+ "grad_norm": 0.466796875,
347
+ "learning_rate": 1.3802730594278161e-05,
348
+ "loss": 1.5761,
349
  "step": 1100
350
  },
351
  {
352
+ "epoch": 4.482071713147411,
353
+ "grad_norm": 0.5390625,
354
+ "learning_rate": 1.3478941473504346e-05,
355
+ "loss": 1.5761,
356
  "step": 1125
357
  },
358
  {
359
+ "epoch": 4.581673306772909,
360
+ "grad_norm": 0.35546875,
361
+ "learning_rate": 1.315094750110301e-05,
362
+ "loss": 1.5763,
363
  "step": 1150
364
  },
365
  {
366
+ "epoch": 4.681274900398407,
367
+ "grad_norm": 0.287109375,
368
+ "learning_rate": 1.2819145109724476e-05,
369
+ "loss": 1.5755,
370
  "step": 1175
371
  },
372
  {
373
+ "epoch": 4.780876494023905,
374
+ "grad_norm": 0.25390625,
375
+ "learning_rate": 1.2483935335096434e-05,
376
+ "loss": 1.5758,
 
 
 
 
 
 
 
 
377
  "step": 1200
378
  },
379
  {
380
+ "epoch": 4.8804780876494025,
381
+ "grad_norm": 0.298828125,
382
+ "learning_rate": 1.2145723331308752e-05,
383
+ "loss": 1.5758,
384
  "step": 1225
385
  },
386
  {
387
+ "epoch": 4.9800796812749,
388
+ "grad_norm": 0.49609375,
389
+ "learning_rate": 1.1804917881120608e-05,
390
+ "loss": 1.576,
391
  "step": 1250
392
  },
393
  {
394
+ "epoch": 5.0,
395
+ "eval_loss": 1.5807639360427856,
396
+ "eval_runtime": 7.4787,
397
+ "eval_samples_per_second": 67.124,
398
+ "eval_steps_per_second": 8.424,
399
+ "step": 1255
400
+ },
401
+ {
402
+ "epoch": 5.079681274900398,
403
+ "grad_norm": 0.3671875,
404
+ "learning_rate": 1.1461930901881812e-05,
405
+ "loss": 1.5752,
406
  "step": 1275
407
  },
408
  {
409
+ "epoch": 5.179282868525896,
410
+ "grad_norm": 0.482421875,
411
+ "learning_rate": 1.111717694766545e-05,
412
+ "loss": 1.5749,
413
  "step": 1300
414
  },
415
  {
416
+ "epoch": 5.278884462151394,
417
+ "grad_norm": 0.359375,
418
+ "learning_rate": 1.0771072708213652e-05,
419
+ "loss": 1.575,
420
  "step": 1325
421
  },
422
  {
423
+ "epoch": 5.378486055776892,
424
+ "grad_norm": 0.255859375,
425
+ "learning_rate": 1.0424036505302062e-05,
426
+ "loss": 1.5749,
427
  "step": 1350
428
  },
429
  {
430
+ "epoch": 5.47808764940239,
431
+ "grad_norm": 0.36328125,
432
+ "learning_rate": 1.0076487787131726e-05,
433
+ "loss": 1.5751,
434
  "step": 1375
435
  },
436
  {
437
+ "epoch": 5.577689243027889,
438
+ "grad_norm": 0.296875,
439
+ "learning_rate": 9.728846621359538e-06,
440
+ "loss": 1.5757,
441
  "step": 1400
442
  },
443
  {
444
+ "epoch": 5.677290836653387,
445
+ "grad_norm": 0.375,
446
+ "learning_rate": 9.381533187379958e-06,
447
+ "loss": 1.5743,
448
  "step": 1425
449
  },
450
  {
451
+ "epoch": 5.776892430278885,
452
+ "grad_norm": 0.2197265625,
453
+ "learning_rate": 9.034967268471674e-06,
454
+ "loss": 1.5748,
455
  "step": 1450
456
  },
457
  {
458
+ "epoch": 5.876494023904383,
459
+ "grad_norm": 0.421875,
460
+ "learning_rate": 8.68956774442306e-06,
461
+ "loss": 1.5753,
462
  "step": 1475
463
  },
464
  {
465
+ "epoch": 5.9760956175298805,
466
+ "grad_norm": 0.28515625,
467
+ "learning_rate": 8.345752085249603e-06,
468
+ "loss": 1.5741,
469
+ "step": 1500
 
470
  },
471
  {
472
+ "epoch": 6.0,
473
+ "eval_loss": 1.5804367065429688,
474
+ "eval_runtime": 7.4746,
475
+ "eval_samples_per_second": 67.161,
476
+ "eval_steps_per_second": 8.429,
477
+ "step": 1506
478
  },
479
  {
480
+ "epoch": 6.075697211155378,
481
+ "grad_norm": 0.6640625,
482
+ "learning_rate": 8.00393584661531e-06,
483
+ "loss": 1.5739,
484
  "step": 1525
485
  },
486
  {
487
+ "epoch": 6.175298804780876,
488
+ "grad_norm": 0.244140625,
489
+ "learning_rate": 7.664532167567864e-06,
490
+ "loss": 1.5734,
491
  "step": 1550
492
  },
493
  {
494
+ "epoch": 6.274900398406374,
495
+ "grad_norm": 0.33984375,
496
+ "learning_rate": 7.327951271194699e-06,
497
+ "loss": 1.5746,
498
  "step": 1575
499
  },
500
  {
501
+ "epoch": 6.374501992031872,
502
+ "grad_norm": 0.2275390625,
503
+ "learning_rate": 6.994599968803408e-06,
504
+ "loss": 1.5741,
505
  "step": 1600
506
  },
507
  {
508
+ "epoch": 6.474103585657371,
509
+ "grad_norm": 0.36328125,
510
+ "learning_rate": 6.664881168225894e-06,
511
+ "loss": 1.5743,
512
  "step": 1625
513
  },
514
  {
515
+ "epoch": 6.573705179282869,
516
+ "grad_norm": 0.41796875,
517
+ "learning_rate": 6.339193386840445e-06,
518
+ "loss": 1.5739,
519
  "step": 1650
520
  },
521
  {
522
+ "epoch": 6.673306772908367,
523
+ "grad_norm": 0.265625,
524
+ "learning_rate": 6.017930269900377e-06,
525
+ "loss": 1.5755,
526
  "step": 1675
527
  },
528
  {
529
+ "epoch": 6.772908366533865,
530
+ "grad_norm": 0.267578125,
531
+ "learning_rate": 5.701480114751432e-06,
532
+ "loss": 1.574,
533
  "step": 1700
534
  },
535
  {
536
+ "epoch": 6.872509960159363,
537
+ "grad_norm": 0.26171875,
538
+ "learning_rate": 5.390225401512923e-06,
539
+ "loss": 1.5747,
540
  "step": 1725
541
  },
542
  {
543
+ "epoch": 6.972111553784861,
544
+ "grad_norm": 0.314453125,
545
+ "learning_rate": 5.084542330789988e-06,
546
+ "loss": 1.5752,
547
  "step": 1750
548
  },
549
  {
550
+ "epoch": 7.0,
551
+ "eval_loss": 1.5802167654037476,
552
+ "eval_runtime": 7.2108,
553
+ "eval_samples_per_second": 69.617,
554
+ "eval_steps_per_second": 8.737,
555
+ "step": 1757
556
  },
557
  {
558
+ "epoch": 7.0717131474103585,
559
+ "grad_norm": 0.392578125,
560
+ "learning_rate": 4.784800368975557e-06,
561
+ "loss": 1.5742,
562
+ "step": 1775
 
563
  },
564
  {
565
+ "epoch": 7.171314741035856,
566
+ "grad_norm": 0.302734375,
567
+ "learning_rate": 4.491361801691717e-06,
568
+ "loss": 1.5747,
569
  "step": 1800
570
  },
571
  {
572
+ "epoch": 7.270916334661354,
573
+ "grad_norm": 0.25,
574
+ "learning_rate": 4.204581295910207e-06,
575
+ "loss": 1.5749,
576
  "step": 1825
577
  },
578
  {
579
+ "epoch": 7.370517928286852,
580
+ "grad_norm": 0.380859375,
581
+ "learning_rate": 3.924805471281184e-06,
582
+ "loss": 1.574,
583
  "step": 1850
584
  },
585
  {
586
+ "epoch": 7.47011952191235,
587
+ "grad_norm": 0.2080078125,
588
+ "learning_rate": 3.652372481188512e-06,
589
+ "loss": 1.5742,
590
  "step": 1875
591
  },
592
  {
593
+ "epoch": 7.569721115537849,
594
+ "grad_norm": 0.23046875,
595
+ "learning_rate": 3.387611604037848e-06,
596
+ "loss": 1.574,
597
  "step": 1900
598
  },
599
  {
600
+ "epoch": 7.669322709163347,
601
+ "grad_norm": 0.322265625,
602
+ "learning_rate": 3.1308428452715643e-06,
603
+ "loss": 1.5747,
604
  "step": 1925
605
  },
606
  {
607
+ "epoch": 7.768924302788845,
608
+ "grad_norm": 0.1953125,
609
+ "learning_rate": 2.8823765505914827e-06,
610
+ "loss": 1.5743,
611
  "step": 1950
612
  },
613
  {
614
+ "epoch": 7.868525896414343,
615
+ "grad_norm": 0.2275390625,
616
+ "learning_rate": 2.642513030856955e-06,
617
+ "loss": 1.5742,
618
  "step": 1975
619
  },
620
  {
621
+ "epoch": 7.968127490039841,
622
+ "grad_norm": 0.2470703125,
623
+ "learning_rate": 2.4115421991116605e-06,
624
+ "loss": 1.574,
625
  "step": 2000
626
  },
627
  {
628
+ "epoch": 8.0,
629
+ "eval_loss": 1.5801949501037598,
630
+ "eval_runtime": 7.4448,
631
+ "eval_samples_per_second": 67.43,
632
+ "eval_steps_per_second": 8.462,
633
+ "step": 2008
634
+ },
635
+ {
636
+ "epoch": 8.06772908366534,
637
+ "grad_norm": 0.375,
638
+ "learning_rate": 2.1897432201777614e-06,
639
+ "loss": 1.5744,
640
  "step": 2025
641
  },
642
  {
643
+ "epoch": 8.167330677290837,
644
+ "grad_norm": 0.216796875,
645
+ "learning_rate": 1.977384173241027e-06,
646
+ "loss": 1.574,
647
  "step": 2050
648
  },
649
  {
650
+ "epoch": 8.266932270916335,
651
+ "grad_norm": 0.21875,
652
+ "learning_rate": 1.774721727834684e-06,
653
+ "loss": 1.5749,
654
  "step": 2075
655
  },
656
  {
657
+ "epoch": 8.366533864541832,
658
+ "grad_norm": 0.240234375,
659
+ "learning_rate": 1.5820008336136462e-06,
660
+ "loss": 1.5743,
 
 
 
 
 
 
 
 
661
  "step": 2100
662
  },
663
  {
664
+ "epoch": 8.466135458167331,
665
+ "grad_norm": 0.173828125,
666
+ "learning_rate": 1.3994544242940777e-06,
667
+ "loss": 1.5736,
668
  "step": 2125
669
  },
670
  {
671
+ "epoch": 8.565737051792828,
672
+ "grad_norm": 0.275390625,
673
+ "learning_rate": 1.2273031361160958e-06,
674
+ "loss": 1.5739,
675
  "step": 2150
676
  },
677
  {
678
+ "epoch": 8.665338645418327,
679
+ "grad_norm": 0.2265625,
680
+ "learning_rate": 1.0657550411699623e-06,
681
+ "loss": 1.5744,
682
  "step": 2175
683
  },
684
  {
685
+ "epoch": 8.764940239043824,
686
+ "grad_norm": 0.3515625,
687
+ "learning_rate": 9.150053959080008e-07,
688
+ "loss": 1.5743,
689
  "step": 2200
690
  },
691
  {
692
+ "epoch": 8.864541832669323,
693
+ "grad_norm": 0.322265625,
694
+ "learning_rate": 7.75236405146258e-07,
695
+ "loss": 1.5741,
696
  "step": 2225
697
  },
698
  {
699
+ "epoch": 8.964143426294822,
700
+ "grad_norm": 0.1787109375,
701
+ "learning_rate": 6.466170018411422e-07,
702
+ "loss": 1.5743,
703
  "step": 2250
704
  },
705
  {
706
+ "epoch": 9.0,
707
+ "eval_loss": 1.5801681280136108,
708
+ "eval_runtime": 7.6077,
709
+ "eval_samples_per_second": 65.986,
710
+ "eval_steps_per_second": 8.281,
711
+ "step": 2259
712
+ },
713
+ {
714
+ "epoch": 9.063745019920319,
715
+ "grad_norm": 0.19921875,
716
+ "learning_rate": 5.293026429071857e-07,
717
+ "loss": 1.5739,
718
  "step": 2275
719
  },
720
  {
721
+ "epoch": 9.163346613545817,
722
+ "grad_norm": 0.29296875,
723
+ "learning_rate": 4.2343512132276055e-07,
724
+ "loss": 1.5742,
725
  "step": 2300
726
  },
727
  {
728
+ "epoch": 9.262948207171315,
729
+ "grad_norm": 0.275390625,
730
+ "learning_rate": 3.2914239475079655e-07,
731
+ "loss": 1.5749,
732
  "step": 2325
733
  },
734
  {
735
+ "epoch": 9.362549800796813,
736
+ "grad_norm": 0.251953125,
737
+ "learning_rate": 2.4653843088170206e-07,
738
+ "loss": 1.5744,
739
  "step": 2350
740
  },
741
  {
742
+ "epoch": 9.46215139442231,
743
+ "grad_norm": 0.27734375,
744
+ "learning_rate": 1.757230696853518e-07,
745
+ "loss": 1.5739,
746
  "step": 2375
747
  },
748
  {
749
+ "epoch": 9.56175298804781,
750
+ "grad_norm": 0.2138671875,
751
+ "learning_rate": 1.1678190273868073e-07,
752
+ "loss": 1.5741,
 
 
 
 
 
 
 
 
753
  "step": 2400
754
  },
755
  {
756
+ "epoch": 9.661354581673306,
757
+ "grad_norm": 0.251953125,
758
+ "learning_rate": 6.978616977470708e-08,
759
+ "loss": 1.574,
760
  "step": 2425
761
  },
762
  {
763
+ "epoch": 9.760956175298805,
764
+ "grad_norm": 0.466796875,
765
+ "learning_rate": 3.4792672578038974e-08,
766
+ "loss": 1.5745,
767
  "step": 2450
768
  },
769
  {
770
+ "epoch": 9.860557768924302,
771
+ "grad_norm": 0.1923828125,
772
+ "learning_rate": 1.184370633092291e-08,
773
+ "loss": 1.5735,
774
  "step": 2475
775
  },
776
  {
777
+ "epoch": 9.9601593625498,
778
+ "grad_norm": 0.197265625,
779
+ "learning_rate": 9.670084928137258e-10,
780
+ "loss": 1.5736,
781
  "step": 2500
782
  },
783
  {
784
+ "epoch": 10.0,
785
+ "eval_loss": 1.580127239227295,
786
+ "eval_runtime": 9.705,
787
+ "eval_samples_per_second": 51.726,
788
+ "eval_steps_per_second": 6.491,
789
+ "step": 2510
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790
  }
791
  ],
792
  "logging_steps": 25,
793
+ "max_steps": 2510,
794
  "num_input_tokens_seen": 0,
795
  "num_train_epochs": 10,
796
  "save_steps": 500,
 
801
  "should_evaluate": false,
802
  "should_log": false,
803
  "should_save": true,
804
+ "should_training_stop": true
805
  },
806
  "attributes": {}
807
  }
808
  },
809
+ "total_flos": 9.82463661353902e+16,
810
  "train_batch_size": 8,
811
  "trial_name": null,
812
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42d76ebcb9158241976952c09114f0be4f135c1e43ad1f55f4156fb783882cac
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99f0fd7499710736e0323507b1752026d9637c5b6b5a6b6798a0a89f4b7d2dd
3
  size 5368