lamnt2008 committed on
Commit
257057f
·
1 Parent(s): 1e90be7

Upload 8 files

Browse files
Files changed (6) hide show
  1. optimizer.pt +2 -2
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +0 -0
  4. scheduler.pt +0 -0
  5. trainer_state.json +13 -571
  6. training_args.bin +0 -0
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:287dc59e3e3693ecedef27b7647e789ffe910dc1816b59f872757ead0c22601c
3
- size 686885317
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96d1ed6dbda3bbbeecb2dcd41b152da54ba605c70cfdee703aa510c1cb7626ef
3
+ size 686884869
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b142164af3820b81d3e29a0b2a37612e9a0d66c021458d9a317761f90dacc73
3
  size 347183353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8681a6843eaa8b33be91a8e49f1f781a9dc65fcb60237810f5e7f1dab110e3f1
3
  size 347183353
rng_state.pth CHANGED
Binary files a/rng_state.pth and b/rng_state.pth differ
 
scheduler.pt CHANGED
Binary files a/scheduler.pt and b/scheduler.pt differ
 
trainer_state.json CHANGED
@@ -1,598 +1,40 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 22.186642268984446,
5
- "global_step": 97000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
- {
11
- "epoch": 0.23,
12
- "learning_rate": 4.92375724306191e-06,
13
- "loss": 4.617,
14
- "step": 1000
15
- },
16
- {
17
- "epoch": 0.46,
18
- "learning_rate": 4.8475144861238184e-06,
19
- "loss": 4.3463,
20
- "step": 2000
21
- },
22
- {
23
- "epoch": 0.69,
24
- "learning_rate": 4.771271729185728e-06,
25
- "loss": 4.0219,
26
- "step": 3000
27
- },
28
- {
29
- "epoch": 0.91,
30
- "learning_rate": 4.6950289722476364e-06,
31
- "loss": 3.7277,
32
- "step": 4000
33
- },
34
- {
35
- "epoch": 1.14,
36
- "learning_rate": 4.618786215309546e-06,
37
- "loss": 3.4665,
38
- "step": 5000
39
- },
40
- {
41
- "epoch": 1.37,
42
- "learning_rate": 4.542543458371455e-06,
43
- "loss": 3.2523,
44
- "step": 6000
45
- },
46
- {
47
- "epoch": 1.6,
48
- "learning_rate": 4.466300701433364e-06,
49
- "loss": 3.1099,
50
- "step": 7000
51
- },
52
- {
53
- "epoch": 1.83,
54
- "learning_rate": 4.390057944495273e-06,
55
- "loss": 2.9365,
56
- "step": 8000
57
- },
58
- {
59
- "epoch": 2.06,
60
- "learning_rate": 4.313815187557183e-06,
61
- "loss": 2.7993,
62
- "step": 9000
63
- },
64
  {
65
  "epoch": 2.29,
66
- "learning_rate": 4.237572430619091e-06,
67
- "loss": 2.6882,
68
  "step": 10000
69
  },
70
- {
71
- "epoch": 2.52,
72
- "learning_rate": 4.161329673681001e-06,
73
- "loss": 2.5923,
74
- "step": 11000
75
- },
76
- {
77
- "epoch": 2.74,
78
- "learning_rate": 4.085086916742909e-06,
79
- "loss": 2.5037,
80
- "step": 12000
81
- },
82
- {
83
- "epoch": 2.97,
84
- "learning_rate": 4.008844159804819e-06,
85
- "loss": 2.4135,
86
- "step": 13000
87
- },
88
- {
89
- "epoch": 3.2,
90
- "learning_rate": 3.932601402866728e-06,
91
- "loss": 2.3354,
92
- "step": 14000
93
- },
94
- {
95
- "epoch": 3.43,
96
- "learning_rate": 3.856358645928638e-06,
97
- "loss": 2.2609,
98
- "step": 15000
99
- },
100
- {
101
- "epoch": 3.66,
102
- "learning_rate": 3.780115888990546e-06,
103
- "loss": 2.2187,
104
- "step": 16000
105
- },
106
- {
107
- "epoch": 3.89,
108
- "learning_rate": 3.7038731320524552e-06,
109
- "loss": 2.1399,
110
- "step": 17000
111
- },
112
- {
113
- "epoch": 4.12,
114
- "learning_rate": 3.6276303751143642e-06,
115
- "loss": 2.0927,
116
- "step": 18000
117
- },
118
- {
119
- "epoch": 4.35,
120
- "learning_rate": 3.5513876181762737e-06,
121
- "loss": 2.0264,
122
- "step": 19000
123
- },
124
  {
125
  "epoch": 4.57,
126
- "learning_rate": 3.4751448612381827e-06,
127
- "loss": 1.9966,
128
  "step": 20000
129
  },
130
- {
131
- "epoch": 4.8,
132
- "learning_rate": 3.398902104300092e-06,
133
- "loss": 1.9323,
134
- "step": 21000
135
- },
136
- {
137
- "epoch": 5.03,
138
- "learning_rate": 3.322659347362001e-06,
139
- "loss": 1.9327,
140
- "step": 22000
141
- },
142
- {
143
- "epoch": 5.26,
144
- "learning_rate": 3.2464165904239097e-06,
145
- "loss": 1.8627,
146
- "step": 23000
147
- },
148
- {
149
- "epoch": 5.49,
150
- "learning_rate": 3.170173833485819e-06,
151
- "loss": 1.8122,
152
- "step": 24000
153
- },
154
- {
155
- "epoch": 5.72,
156
- "learning_rate": 3.093931076547728e-06,
157
- "loss": 1.8073,
158
- "step": 25000
159
- },
160
- {
161
- "epoch": 5.95,
162
- "learning_rate": 3.0176883196096376e-06,
163
- "loss": 1.7809,
164
- "step": 26000
165
- },
166
- {
167
- "epoch": 6.18,
168
- "learning_rate": 2.9414455626715466e-06,
169
- "loss": 1.7327,
170
- "step": 27000
171
- },
172
- {
173
- "epoch": 6.4,
174
- "learning_rate": 2.8652028057334556e-06,
175
- "loss": 1.6954,
176
- "step": 28000
177
- },
178
- {
179
- "epoch": 6.63,
180
- "learning_rate": 2.7889600487953646e-06,
181
- "loss": 1.6836,
182
- "step": 29000
183
- },
184
  {
185
  "epoch": 6.86,
186
- "learning_rate": 2.7127172918572736e-06,
187
- "loss": 1.6758,
188
  "step": 30000
189
  },
190
- {
191
- "epoch": 7.09,
192
- "learning_rate": 2.6364745349191826e-06,
193
- "loss": 1.6564,
194
- "step": 31000
195
- },
196
- {
197
- "epoch": 7.32,
198
- "learning_rate": 2.560231777981092e-06,
199
- "loss": 1.6141,
200
- "step": 32000
201
- },
202
- {
203
- "epoch": 7.55,
204
- "learning_rate": 2.483989021043001e-06,
205
- "loss": 1.5769,
206
- "step": 33000
207
- },
208
- {
209
- "epoch": 7.78,
210
- "learning_rate": 2.40774626410491e-06,
211
- "loss": 1.6009,
212
- "step": 34000
213
- },
214
- {
215
- "epoch": 8.01,
216
- "learning_rate": 2.3315035071668195e-06,
217
- "loss": 1.581,
218
- "step": 35000
219
- },
220
- {
221
- "epoch": 8.23,
222
- "learning_rate": 2.2552607502287285e-06,
223
- "loss": 1.5363,
224
- "step": 36000
225
- },
226
- {
227
- "epoch": 8.46,
228
- "learning_rate": 2.1790179932906375e-06,
229
- "loss": 1.5425,
230
- "step": 37000
231
- },
232
- {
233
- "epoch": 8.69,
234
- "learning_rate": 2.1027752363525465e-06,
235
- "loss": 1.5053,
236
- "step": 38000
237
- },
238
- {
239
- "epoch": 8.92,
240
- "learning_rate": 2.026532479414456e-06,
241
- "loss": 1.4986,
242
- "step": 39000
243
- },
244
  {
245
  "epoch": 9.15,
246
- "learning_rate": 1.950289722476365e-06,
247
- "loss": 1.4896,
248
  "step": 40000
249
- },
250
- {
251
- "epoch": 9.38,
252
- "learning_rate": 1.874046965538274e-06,
253
- "loss": 1.4829,
254
- "step": 41000
255
- },
256
- {
257
- "epoch": 9.61,
258
- "learning_rate": 1.7978042086001832e-06,
259
- "loss": 1.4634,
260
- "step": 42000
261
- },
262
- {
263
- "epoch": 9.84,
264
- "learning_rate": 1.7215614516620924e-06,
265
- "loss": 1.4559,
266
- "step": 43000
267
- },
268
- {
269
- "epoch": 10.06,
270
- "learning_rate": 1.6453186947240012e-06,
271
- "loss": 1.4388,
272
- "step": 44000
273
- },
274
- {
275
- "epoch": 10.29,
276
- "learning_rate": 1.5690759377859104e-06,
277
- "loss": 1.4299,
278
- "step": 45000
279
- },
280
- {
281
- "epoch": 10.52,
282
- "learning_rate": 1.4928331808478196e-06,
283
- "loss": 1.4328,
284
- "step": 46000
285
- },
286
- {
287
- "epoch": 10.75,
288
- "learning_rate": 1.4165904239097286e-06,
289
- "loss": 1.423,
290
- "step": 47000
291
- },
292
- {
293
- "epoch": 10.98,
294
- "learning_rate": 1.3403476669716379e-06,
295
- "loss": 1.3973,
296
- "step": 48000
297
- },
298
- {
299
- "epoch": 11.21,
300
- "learning_rate": 1.264104910033547e-06,
301
- "loss": 1.4033,
302
- "step": 49000
303
- },
304
- {
305
- "epoch": 11.44,
306
- "learning_rate": 1.187862153095456e-06,
307
- "loss": 1.3921,
308
- "step": 50000
309
- },
310
- {
311
- "epoch": 11.67,
312
- "learning_rate": 1.111619396157365e-06,
313
- "loss": 1.385,
314
- "step": 51000
315
- },
316
- {
317
- "epoch": 11.89,
318
- "learning_rate": 1.035376639219274e-06,
319
- "loss": 1.3807,
320
- "step": 52000
321
- },
322
- {
323
- "epoch": 12.12,
324
- "learning_rate": 9.591338822811833e-07,
325
- "loss": 1.3734,
326
- "step": 53000
327
- },
328
- {
329
- "epoch": 12.35,
330
- "learning_rate": 8.828911253430924e-07,
331
- "loss": 1.3459,
332
- "step": 54000
333
- },
334
- {
335
- "epoch": 12.58,
336
- "learning_rate": 8.066483684050017e-07,
337
- "loss": 1.3675,
338
- "step": 55000
339
- },
340
- {
341
- "epoch": 12.81,
342
- "learning_rate": 7.304056114669108e-07,
343
- "loss": 1.3436,
344
- "step": 56000
345
- },
346
- {
347
- "epoch": 13.04,
348
- "learning_rate": 6.541628545288198e-07,
349
- "loss": 1.3673,
350
- "step": 57000
351
- },
352
- {
353
- "epoch": 13.27,
354
- "learning_rate": 5.779200975907289e-07,
355
- "loss": 1.3417,
356
- "step": 58000
357
- },
358
- {
359
- "epoch": 13.49,
360
- "learning_rate": 2.750838670326319e-06,
361
- "loss": 1.3706,
362
- "step": 59000
363
- },
364
- {
365
- "epoch": 13.72,
366
- "learning_rate": 2.7127172918572736e-06,
367
- "loss": 1.3743,
368
- "step": 60000
369
- },
370
- {
371
- "epoch": 13.95,
372
- "learning_rate": 2.6745959133882283e-06,
373
- "loss": 1.3504,
374
- "step": 61000
375
- },
376
- {
377
- "epoch": 14.18,
378
- "learning_rate": 2.6364745349191826e-06,
379
- "loss": 1.3362,
380
- "step": 62000
381
- },
382
- {
383
- "epoch": 14.41,
384
- "learning_rate": 2.5983531564501373e-06,
385
- "loss": 1.3308,
386
- "step": 63000
387
- },
388
- {
389
- "epoch": 14.64,
390
- "learning_rate": 2.560231777981092e-06,
391
- "loss": 1.3162,
392
- "step": 64000
393
- },
394
- {
395
- "epoch": 14.87,
396
- "learning_rate": 2.5221103995120468e-06,
397
- "loss": 1.2831,
398
- "step": 65000
399
- },
400
- {
401
- "epoch": 15.1,
402
- "learning_rate": 2.483989021043001e-06,
403
- "loss": 1.2969,
404
- "step": 66000
405
- },
406
- {
407
- "epoch": 15.32,
408
- "learning_rate": 2.4458676425739553e-06,
409
- "loss": 1.2785,
410
- "step": 67000
411
- },
412
- {
413
- "epoch": 15.55,
414
- "learning_rate": 2.40774626410491e-06,
415
- "loss": 1.2769,
416
- "step": 68000
417
- },
418
- {
419
- "epoch": 15.78,
420
- "learning_rate": 2.3696248856358648e-06,
421
- "loss": 1.2799,
422
- "step": 69000
423
- },
424
- {
425
- "epoch": 16.01,
426
- "learning_rate": 2.3315035071668195e-06,
427
- "loss": 1.2572,
428
- "step": 70000
429
- },
430
- {
431
- "epoch": 16.24,
432
- "learning_rate": 2.2933821286977738e-06,
433
- "loss": 1.2495,
434
- "step": 71000
435
- },
436
- {
437
- "epoch": 16.47,
438
- "learning_rate": 2.2552607502287285e-06,
439
- "loss": 1.2338,
440
- "step": 72000
441
- },
442
- {
443
- "epoch": 16.7,
444
- "learning_rate": 2.217139371759683e-06,
445
- "loss": 1.2572,
446
- "step": 73000
447
- },
448
- {
449
- "epoch": 16.93,
450
- "learning_rate": 2.1790179932906375e-06,
451
- "loss": 1.2191,
452
- "step": 74000
453
- },
454
- {
455
- "epoch": 17.15,
456
- "learning_rate": 2.1408966148215922e-06,
457
- "loss": 1.2256,
458
- "step": 75000
459
- },
460
- {
461
- "epoch": 17.38,
462
- "learning_rate": 2.1027752363525465e-06,
463
- "loss": 1.2336,
464
- "step": 76000
465
- },
466
- {
467
- "epoch": 17.61,
468
- "learning_rate": 2.0646538578835012e-06,
469
- "loss": 1.2135,
470
- "step": 77000
471
- },
472
- {
473
- "epoch": 17.84,
474
- "learning_rate": 2.026532479414456e-06,
475
- "loss": 1.1806,
476
- "step": 78000
477
- },
478
- {
479
- "epoch": 18.07,
480
- "learning_rate": 1.9884111009454107e-06,
481
- "loss": 1.1971,
482
- "step": 79000
483
- },
484
- {
485
- "epoch": 18.3,
486
- "learning_rate": 1.950289722476365e-06,
487
- "loss": 1.1823,
488
- "step": 80000
489
- },
490
- {
491
- "epoch": 18.53,
492
- "learning_rate": 1.9121683440073192e-06,
493
- "loss": 1.1678,
494
- "step": 81000
495
- },
496
- {
497
- "epoch": 18.76,
498
- "learning_rate": 1.874046965538274e-06,
499
- "loss": 1.1587,
500
- "step": 82000
501
- },
502
- {
503
- "epoch": 18.98,
504
- "learning_rate": 1.8359255870692287e-06,
505
- "loss": 1.179,
506
- "step": 83000
507
- },
508
- {
509
- "epoch": 19.21,
510
- "learning_rate": 1.7978042086001832e-06,
511
- "loss": 1.1547,
512
- "step": 84000
513
- },
514
- {
515
- "epoch": 19.44,
516
- "learning_rate": 1.7596828301311377e-06,
517
- "loss": 1.1558,
518
- "step": 85000
519
- },
520
- {
521
- "epoch": 19.67,
522
- "learning_rate": 1.7215614516620924e-06,
523
- "loss": 1.1634,
524
- "step": 86000
525
- },
526
- {
527
- "epoch": 19.9,
528
- "learning_rate": 1.6834400731930467e-06,
529
- "loss": 1.1421,
530
- "step": 87000
531
- },
532
- {
533
- "epoch": 20.13,
534
- "learning_rate": 1.6453186947240012e-06,
535
- "loss": 1.1497,
536
- "step": 88000
537
- },
538
- {
539
- "epoch": 20.36,
540
- "learning_rate": 1.607197316254956e-06,
541
- "loss": 1.1239,
542
- "step": 89000
543
- },
544
- {
545
- "epoch": 20.59,
546
- "learning_rate": 1.5690759377859104e-06,
547
- "loss": 1.1538,
548
- "step": 90000
549
- },
550
- {
551
- "epoch": 20.81,
552
- "learning_rate": 1.5309545593168651e-06,
553
- "loss": 1.1357,
554
- "step": 91000
555
- },
556
- {
557
- "epoch": 21.04,
558
- "learning_rate": 1.4928331808478196e-06,
559
- "loss": 1.1193,
560
- "step": 92000
561
- },
562
- {
563
- "epoch": 21.27,
564
- "learning_rate": 1.454711802378774e-06,
565
- "loss": 1.1302,
566
- "step": 93000
567
- },
568
- {
569
- "epoch": 21.5,
570
- "learning_rate": 1.4165904239097286e-06,
571
- "loss": 1.0902,
572
- "step": 94000
573
- },
574
- {
575
- "epoch": 21.73,
576
- "learning_rate": 1.3784690454406831e-06,
577
- "loss": 1.1061,
578
- "step": 95000
579
- },
580
- {
581
- "epoch": 21.96,
582
- "learning_rate": 1.3403476669716379e-06,
583
- "loss": 1.1025,
584
- "step": 96000
585
- },
586
- {
587
- "epoch": 22.19,
588
- "learning_rate": 1.3022262885025924e-06,
589
- "loss": 1.1007,
590
- "step": 97000
591
  }
592
  ],
593
- "max_steps": 131160,
594
- "num_train_epochs": 30,
595
- "total_flos": 2.406092262019348e+20,
596
  "trial_name": null,
597
  "trial_params": null
598
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.149130832570906,
5
+ "global_step": 40000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  {
11
  "epoch": 2.29,
12
+ "learning_rate": 9.42817932296432e-06,
13
+ "loss": 2.4808,
14
  "step": 10000
15
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  {
17
  "epoch": 4.57,
18
+ "learning_rate": 8.856358645928637e-06,
19
+ "loss": 1.2166,
20
  "step": 20000
21
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  {
23
  "epoch": 6.86,
24
+ "learning_rate": 8.284537968892956e-06,
25
+ "loss": 0.9323,
26
  "step": 30000
27
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  {
29
  "epoch": 9.15,
30
+ "learning_rate": 7.712717291857275e-06,
31
+ "loss": 0.7901,
32
  "step": 40000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  }
34
  ],
35
+ "max_steps": 174880,
36
+ "num_train_epochs": 40,
37
+ "total_flos": 9.92204617219769e+19,
38
  "trial_name": null,
39
  "trial_params": null
40
  }
training_args.bin CHANGED
Binary files a/training_args.bin and b/training_args.bin differ