Fahad-S commited on
Commit
e982c93
·
verified ·
1 Parent(s): ea51088

Upload checkpoint-600/trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. checkpoint-600/trainer_state.json +514 -0
checkpoint-600/trainer_state.json ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.2609637479559113,
6
+ "eval_steps": 500,
7
+ "global_step": 600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.054367447490814874,
14
+ "grad_norm": 2.0497019290924072,
15
+ "learning_rate": 2.2727272727272728e-06,
16
+ "loss": 0.3348,
17
+ "mean_token_accuracy": 0.036216020046958876,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.10873489498162975,
22
+ "grad_norm": 3.0014805793762207,
23
+ "learning_rate": 4.5454545454545455e-06,
24
+ "loss": 0.2845,
25
+ "mean_token_accuracy": 0.03815823743243527,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.16310234247244462,
30
+ "grad_norm": 2.813481569290161,
31
+ "learning_rate": 4.998433870444026e-06,
32
+ "loss": 0.2715,
33
+ "mean_token_accuracy": 0.03921769789994869,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.2174697899632595,
38
+ "grad_norm": 1.4481842517852783,
39
+ "learning_rate": 4.992074831939997e-06,
40
+ "loss": 0.2675,
41
+ "mean_token_accuracy": 0.03833619062224898,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.2718372374540744,
46
+ "grad_norm": 1.9870078563690186,
47
+ "learning_rate": 4.980837439924479e-06,
48
+ "loss": 0.2632,
49
+ "mean_token_accuracy": 0.039046796466209344,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.32620468494488924,
54
+ "grad_norm": 1.224491834640503,
55
+ "learning_rate": 4.9647436921200514e-06,
56
+ "loss": 0.2593,
57
+ "mean_token_accuracy": 0.04083767885422276,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.38057213243570415,
62
+ "grad_norm": 1.2895762920379639,
63
+ "learning_rate": 4.943825092793806e-06,
64
+ "loss": 0.2574,
65
+ "mean_token_accuracy": 0.040451897433194973,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.434939579926519,
70
+ "grad_norm": 1.4011473655700684,
71
+ "learning_rate": 4.91812259108626e-06,
72
+ "loss": 0.2545,
73
+ "mean_token_accuracy": 0.041019549200973414,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.4893070274173339,
78
+ "grad_norm": 1.6724004745483398,
79
+ "learning_rate": 4.887686500851499e-06,
80
+ "loss": 0.2487,
81
+ "mean_token_accuracy": 0.042838540498996734,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.5436744749081488,
86
+ "grad_norm": 1.4560455083847046,
87
+ "learning_rate": 4.852576402165436e-06,
88
+ "loss": 0.2506,
89
+ "mean_token_accuracy": 0.042806161533553676,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.5980419223989636,
94
+ "grad_norm": 1.4531595706939697,
95
+ "learning_rate": 4.812861024695024e-06,
96
+ "loss": 0.2459,
97
+ "mean_token_accuracy": 0.04187717779204832,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.6524093698897785,
102
+ "grad_norm": 1.185799241065979,
103
+ "learning_rate": 4.768618113156695e-06,
104
+ "loss": 0.2475,
105
+ "mean_token_accuracy": 0.04067943783993542,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.7067768173805934,
110
+ "grad_norm": 1.970852017402649,
111
+ "learning_rate": 4.719934275127435e-06,
112
+ "loss": 0.247,
113
+ "mean_token_accuracy": 0.04101934070331481,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.7611442648714083,
118
+ "grad_norm": 1.789883017539978,
119
+ "learning_rate": 4.666904811506382e-06,
120
+ "loss": 0.2442,
121
+ "mean_token_accuracy": 0.04314688319063862,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.8155117123622231,
126
+ "grad_norm": 1.707210659980774,
127
+ "learning_rate": 4.609633529958841e-06,
128
+ "loss": 0.2453,
129
+ "mean_token_accuracy": 0.04226606881111365,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.869879159853038,
134
+ "grad_norm": 1.4981927871704102,
135
+ "learning_rate": 4.5482325417079045e-06,
136
+ "loss": 0.2455,
137
+ "mean_token_accuracy": 0.04208922391790111,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.9242466073438529,
142
+ "grad_norm": 1.4780975580215454,
143
+ "learning_rate": 4.482822042071466e-06,
144
+ "loss": 0.242,
145
+ "mean_token_accuracy": 0.043558270454741435,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.9786140548346678,
150
+ "grad_norm": 1.8650013208389282,
151
+ "learning_rate": 4.413530075174245e-06,
152
+ "loss": 0.2445,
153
+ "mean_token_accuracy": 0.04371535095342551,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 1.0326204684944889,
158
+ "grad_norm": 1.2570964097976685,
159
+ "learning_rate": 4.340492283295396e-06,
160
+ "loss": 0.2383,
161
+ "mean_token_accuracy": 0.04298217990664554,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 1.0869879159853038,
166
+ "grad_norm": 1.0197902917861938,
167
+ "learning_rate": 4.263851641342383e-06,
168
+ "loss": 0.236,
169
+ "mean_token_accuracy": 0.04432974819592346,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 1.1413553634761187,
174
+ "grad_norm": 1.1943126916885376,
175
+ "learning_rate": 4.1837581769708755e-06,
176
+ "loss": 0.238,
177
+ "mean_token_accuracy": 0.043376582934797625,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 1.1957228109669336,
182
+ "grad_norm": 1.7524161338806152,
183
+ "learning_rate": 4.100368676898575e-06,
184
+ "loss": 0.2374,
185
+ "mean_token_accuracy": 0.04293899511212658,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 1.2500902584577485,
190
+ "grad_norm": 1.563820719718933,
191
+ "learning_rate": 4.013846379987847e-06,
192
+ "loss": 0.2371,
193
+ "mean_token_accuracy": 0.043539136235267506,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 1.3044577059485634,
198
+ "grad_norm": 1.222346544265747,
199
+ "learning_rate": 3.924360657697987e-06,
200
+ "loss": 0.2388,
201
+ "mean_token_accuracy": 0.043110936877928906,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 1.358825153439378,
206
+ "grad_norm": 1.827516794204712,
207
+ "learning_rate": 3.832086682532633e-06,
208
+ "loss": 0.2366,
209
+ "mean_token_accuracy": 0.04413067772566137,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 1.4131926009301932,
214
+ "grad_norm": 1.1535050868988037,
215
+ "learning_rate": 3.7372050851313597e-06,
216
+ "loss": 0.2317,
217
+ "mean_token_accuracy": 0.04412070568432682,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 1.4675600484210078,
222
+ "grad_norm": 2.219189405441284,
223
+ "learning_rate": 3.639901600676725e-06,
224
+ "loss": 0.2293,
225
+ "mean_token_accuracy": 0.043295259247861394,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 1.5219274959118227,
230
+ "grad_norm": 1.609122633934021,
231
+ "learning_rate": 3.5403667053089263e-06,
232
+ "loss": 0.2279,
233
+ "mean_token_accuracy": 0.046050211629699335,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 1.5762949434026376,
238
+ "grad_norm": 1.0812249183654785,
239
+ "learning_rate": 3.4387952432598102e-06,
240
+ "loss": 0.2327,
241
+ "mean_token_accuracy": 0.04309764919016743,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 1.6306623908934526,
246
+ "grad_norm": 1.4412078857421875,
247
+ "learning_rate": 3.3353860454361398e-06,
248
+ "loss": 0.2345,
249
+ "mean_token_accuracy": 0.042526886053747145,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 1.6850298383842675,
254
+ "grad_norm": 1.4374982118606567,
255
+ "learning_rate": 3.2303415401987543e-06,
256
+ "loss": 0.2326,
257
+ "mean_token_accuracy": 0.042373882613537715,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 1.7393972858750821,
262
+ "grad_norm": 1.3343548774719238,
263
+ "learning_rate": 3.1238673570995526e-06,
264
+ "loss": 0.2308,
265
+ "mean_token_accuracy": 0.04209477992808388,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 1.7937647333658973,
270
+ "grad_norm": 1.3959535360336304,
271
+ "learning_rate": 3.0161719243519848e-06,
272
+ "loss": 0.2265,
273
+ "mean_token_accuracy": 0.04400216135891242,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 1.848132180856712,
278
+ "grad_norm": 1.9636735916137695,
279
+ "learning_rate": 2.907466060823037e-06,
280
+ "loss": 0.227,
281
+ "mean_token_accuracy": 0.044824491128019874,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 1.902499628347527,
286
+ "grad_norm": 1.432839274406433,
287
+ "learning_rate": 2.7979625633454005e-06,
288
+ "loss": 0.2281,
289
+ "mean_token_accuracy": 0.043323908064667196,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 1.9568670758383417,
294
+ "grad_norm": 1.2604482173919678,
295
+ "learning_rate": 2.6878757901576775e-06,
296
+ "loss": 0.2254,
297
+ "mean_token_accuracy": 0.04304394573555328,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 2.010873489498163,
302
+ "grad_norm": 1.0908658504486084,
303
+ "learning_rate": 2.5774212412880636e-06,
304
+ "loss": 0.2256,
305
+ "mean_token_accuracy": 0.04459014333798497,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 2.0652409369889777,
310
+ "grad_norm": 1.6308084726333618,
311
+ "learning_rate": 2.4668151367029235e-06,
312
+ "loss": 0.2245,
313
+ "mean_token_accuracy": 0.044322372208444,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 2.119608384479793,
318
+ "grad_norm": 1.3369026184082031,
319
+ "learning_rate": 2.35627399304605e-06,
320
+ "loss": 0.2188,
321
+ "mean_token_accuracy": 0.0456739331535573,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 2.1739758319706075,
326
+ "grad_norm": 1.295975685119629,
327
+ "learning_rate": 2.2460141997971695e-06,
328
+ "loss": 0.2193,
329
+ "mean_token_accuracy": 0.04523389639671223,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 2.2283432794614226,
334
+ "grad_norm": 1.357338309288025,
335
+ "learning_rate": 2.1362515956793717e-06,
336
+ "loss": 0.2192,
337
+ "mean_token_accuracy": 0.04524684594507562,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 2.2827107269522373,
342
+ "grad_norm": 1.4117406606674194,
343
+ "learning_rate": 2.027201046144677e-06,
344
+ "loss": 0.2191,
345
+ "mean_token_accuracy": 0.044970535605898476,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 2.337078174443052,
350
+ "grad_norm": 1.1702640056610107,
351
+ "learning_rate": 1.9190760227648183e-06,
352
+ "loss": 0.2199,
353
+ "mean_token_accuracy": 0.04270703578586108,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 2.391445621933867,
358
+ "grad_norm": 1.1582852602005005,
359
+ "learning_rate": 1.8120881853506179e-06,
360
+ "loss": 0.2177,
361
+ "mean_token_accuracy": 0.04548128471906239,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 2.445813069424682,
366
+ "grad_norm": 1.424065351486206,
367
+ "learning_rate": 1.7064469676179682e-06,
368
+ "loss": 0.219,
369
+ "mean_token_accuracy": 0.04496145398898079,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 2.500180516915497,
374
+ "grad_norm": 1.851121425628662,
375
+ "learning_rate": 1.6023591672114992e-06,
376
+ "loss": 0.2163,
377
+ "mean_token_accuracy": 0.04483195860047999,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 2.5545479644063116,
382
+ "grad_norm": 1.0778027772903442,
383
+ "learning_rate": 1.5000285408884734e-06,
384
+ "loss": 0.2163,
385
+ "mean_token_accuracy": 0.04510914630936895,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 2.6089154118971267,
390
+ "grad_norm": 1.2224993705749512,
391
+ "learning_rate": 1.3996554056553723e-06,
392
+ "loss": 0.2141,
393
+ "mean_token_accuracy": 0.04724993375784834,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 2.6632828593879414,
398
+ "grad_norm": 1.496194839477539,
399
+ "learning_rate": 1.3014362466379407e-06,
400
+ "loss": 0.2193,
401
+ "mean_token_accuracy": 0.045281845046338275,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 2.717650306878756,
406
+ "grad_norm": 1.2077394723892212,
407
+ "learning_rate": 1.2055633324523324e-06,
408
+ "loss": 0.2149,
409
+ "mean_token_accuracy": 0.044335345691433756,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 2.772017754369571,
414
+ "grad_norm": 1.171938180923462,
415
+ "learning_rate": 1.1122243388302622e-06,
416
+ "loss": 0.2174,
417
+ "mean_token_accuracy": 0.04475436422835628,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 2.8263852018603863,
422
+ "grad_norm": 1.2405449151992798,
423
+ "learning_rate": 1.0216019812349508e-06,
424
+ "loss": 0.2163,
425
+ "mean_token_accuracy": 0.04440462300572108,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 2.880752649351201,
430
+ "grad_norm": 1.1590036153793335,
431
+ "learning_rate": 9.338736571870205e-07,
432
+ "loss": 0.2153,
433
+ "mean_token_accuracy": 0.043993269855855034,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 2.9351200968420157,
438
+ "grad_norm": 1.6510144472122192,
439
+ "learning_rate": 8.492110990005228e-07,
440
+ "loss": 0.2127,
441
+ "mean_token_accuracy": 0.045991620956010594,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 2.989487544332831,
446
+ "grad_norm": 1.291462779045105,
447
+ "learning_rate": 7.677800376088657e-07,
448
+ "loss": 0.2109,
449
+ "mean_token_accuracy": 0.045755728109907065,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 3.043493957992652,
454
+ "grad_norm": 1.295513391494751,
455
+ "learning_rate": 6.897398781387299e-07,
456
+ "loss": 0.209,
457
+ "mean_token_accuracy": 0.04398122681887118,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 3.097861405483467,
462
+ "grad_norm": 1.6088683605194092,
463
+ "learning_rate": 6.152433878670485e-07,
464
+ "loss": 0.2053,
465
+ "mean_token_accuracy": 0.04704667659716506,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 3.1522288529742815,
470
+ "grad_norm": 1.117344617843628,
471
+ "learning_rate": 5.444363971718875e-07,
472
+ "loss": 0.2127,
473
+ "mean_token_accuracy": 0.04461234014015645,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 3.2065963004650966,
478
+ "grad_norm": 1.4021047353744507,
479
+ "learning_rate": 4.774575140626317e-07,
480
+ "loss": 0.2132,
481
+ "mean_token_accuracy": 0.044966605161243935,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 3.2609637479559113,
486
+ "grad_norm": 1.2901127338409424,
487
+ "learning_rate": 4.144378528483009e-07,
488
+ "loss": 0.2096,
489
+ "mean_token_accuracy": 0.04288893376269698,
490
+ "step": 600
491
+ }
492
+ ],
493
+ "logging_steps": 10,
494
+ "max_steps": 732,
495
+ "num_input_tokens_seen": 0,
496
+ "num_train_epochs": 4,
497
+ "save_steps": 300,
498
+ "stateful_callbacks": {
499
+ "TrainerControl": {
500
+ "args": {
501
+ "should_epoch_stop": false,
502
+ "should_evaluate": false,
503
+ "should_log": false,
504
+ "should_save": true,
505
+ "should_training_stop": false
506
+ },
507
+ "attributes": {}
508
+ }
509
+ },
510
+ "total_flos": 8.169216325675168e+18,
511
+ "train_batch_size": 1,
512
+ "trial_name": null,
513
+ "trial_params": null
514
+ }