mgh6 commited on
Commit
d1b1664
·
verified ·
1 Parent(s): b0a5dcb

Training in progress, step 2, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "tattabio/gLM2_650M",
3
  "architectures": [
4
  "gLM2ForMaskedLM"
5
  ],
@@ -8,14 +8,14 @@
8
  "AutoModel": "modeling_glm2.gLM2Model",
9
  "AutoModelForMaskedLM": "modeling_glm2.gLM2ForMaskedLM"
10
  },
11
- "depth": 33,
12
- "dim": 1280,
13
  "ffn_dim_multiplier": null,
14
- "heads": 20,
15
  "model_type": "gLM2",
16
  "norm_eps": 1e-05,
17
  "swiglu_multiple_of": 256,
18
  "torch_dtype": "float32",
19
- "transformers_version": "4.45.2",
20
  "vocab_size": 37
21
  }
 
1
  {
2
+ "_name_or_path": "tattabio/gLM2_150M",
3
  "architectures": [
4
  "gLM2ForMaskedLM"
5
  ],
 
8
  "AutoModel": "modeling_glm2.gLM2Model",
9
  "AutoModelForMaskedLM": "modeling_glm2.gLM2ForMaskedLM"
10
  },
11
+ "depth": 30,
12
+ "dim": 640,
13
  "ffn_dim_multiplier": null,
14
+ "heads": 10,
15
  "model_type": "gLM2",
16
  "norm_eps": 1e-05,
17
  "swiglu_multiple_of": 256,
18
  "torch_dtype": "float32",
19
+ "transformers_version": "4.46.0",
20
  "vocab_size": 37
21
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9eae45eb43651c4ce612c5b264270a3ccdfbc48e1be2784320e0059c614c3cab
3
- size 2682482800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:516558ed7782de66fc542438abb1c93e159afd70a2aeb6571ce83cca423452b0
3
+ size 609855088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36970303513d3e205403c36051106bf22e33ef86f3a1e71a2f1e2cba961b8110
3
- size 5365108834
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:812c91eacfd5aea68d8b5decb8b50302d3944860c0aa6ecd636549bd4f072a92
3
+ size 1219840058
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd261f24a3b802018daa7344b83f247d318502ad53b463783d522e7fc68f088e
3
+ size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:214df0e2d0c96471516754f237b8e237791d4cac9a44207b49ae1586ecbb810a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59630a3df2ec5543c18897bf2cb0562e6bac8d472d75091b8f7ddabcb069715a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,859 +1,33 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.46051392874746205,
5
- "eval_steps": 500,
6
- "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.003837616072895517,
13
- "grad_norm": 0.660070538520813,
14
- "learning_rate": 9.961621123733497e-05,
15
- "loss": 1.2329,
16
- "step": 50
17
  },
18
  {
19
- "epoch": 0.007675232145791034,
20
- "grad_norm": 0.795362114906311,
21
- "learning_rate": 9.923242247466995e-05,
22
- "loss": 1.1816,
23
- "step": 100
24
- },
25
- {
26
- "epoch": 0.011512848218686552,
27
- "grad_norm": 0.9296917915344238,
28
- "learning_rate": 9.884863371200491e-05,
29
- "loss": 1.1603,
30
- "step": 150
31
- },
32
- {
33
- "epoch": 0.015350464291582069,
34
- "grad_norm": 0.6324566006660461,
35
- "learning_rate": 9.846484494933989e-05,
36
- "loss": 1.1505,
37
- "step": 200
38
- },
39
- {
40
- "epoch": 0.019188080364477587,
41
- "grad_norm": 0.8173375725746155,
42
- "learning_rate": 9.808105618667486e-05,
43
- "loss": 1.1454,
44
- "step": 250
45
- },
46
- {
47
- "epoch": 0.023025696437373105,
48
- "grad_norm": 0.6401157975196838,
49
- "learning_rate": 9.769726742400983e-05,
50
- "loss": 1.1346,
51
- "step": 300
52
- },
53
- {
54
- "epoch": 0.026863312510268623,
55
- "grad_norm": 0.6105799674987793,
56
- "learning_rate": 9.731347866134481e-05,
57
- "loss": 1.1263,
58
- "step": 350
59
- },
60
- {
61
- "epoch": 0.030700928583164137,
62
- "grad_norm": 0.3902953863143921,
63
- "learning_rate": 9.692968989867978e-05,
64
- "loss": 1.1201,
65
- "step": 400
66
- },
67
- {
68
- "epoch": 0.034538544656059655,
69
- "grad_norm": 0.8294398188591003,
70
- "learning_rate": 9.654590113601474e-05,
71
- "loss": 1.1123,
72
- "step": 450
73
- },
74
- {
75
- "epoch": 0.03837616072895517,
76
- "grad_norm": 0.7488604187965393,
77
- "learning_rate": 9.616211237334971e-05,
78
- "loss": 1.1099,
79
- "step": 500
80
- },
81
- {
82
- "epoch": 0.04221377680185069,
83
- "grad_norm": 0.32585829496383667,
84
- "learning_rate": 9.577832361068468e-05,
85
- "loss": 1.1056,
86
- "step": 550
87
- },
88
- {
89
- "epoch": 0.04605139287474621,
90
- "grad_norm": 0.48506152629852295,
91
- "learning_rate": 9.539453484801966e-05,
92
- "loss": 1.1021,
93
- "step": 600
94
- },
95
- {
96
- "epoch": 0.04988900894764173,
97
- "grad_norm": 0.8694831132888794,
98
- "learning_rate": 9.501074608535463e-05,
99
- "loss": 1.0931,
100
- "step": 650
101
- },
102
- {
103
- "epoch": 0.053726625020537246,
104
- "grad_norm": 1.0110464096069336,
105
- "learning_rate": 9.462695732268958e-05,
106
- "loss": 1.0903,
107
- "step": 700
108
- },
109
- {
110
- "epoch": 0.05756424109343276,
111
- "grad_norm": 0.5741580128669739,
112
- "learning_rate": 9.424316856002457e-05,
113
- "loss": 1.0871,
114
- "step": 750
115
- },
116
- {
117
- "epoch": 0.061401857166328275,
118
- "grad_norm": 1.2655494213104248,
119
- "learning_rate": 9.385937979735953e-05,
120
- "loss": 1.0825,
121
- "step": 800
122
- },
123
- {
124
- "epoch": 0.0652394732392238,
125
- "grad_norm": 0.6292811036109924,
126
- "learning_rate": 9.347559103469452e-05,
127
- "loss": 1.0787,
128
- "step": 850
129
- },
130
- {
131
- "epoch": 0.06907708931211931,
132
- "grad_norm": 0.6818861961364746,
133
- "learning_rate": 9.309180227202948e-05,
134
- "loss": 1.0738,
135
- "step": 900
136
- },
137
- {
138
- "epoch": 0.07291470538501482,
139
- "grad_norm": 0.5532193183898926,
140
- "learning_rate": 9.270801350936445e-05,
141
- "loss": 1.0714,
142
- "step": 950
143
- },
144
- {
145
- "epoch": 0.07675232145791035,
146
- "grad_norm": 0.6174580454826355,
147
- "learning_rate": 9.232422474669942e-05,
148
- "loss": 1.0674,
149
- "step": 1000
150
- },
151
- {
152
- "epoch": 0.08058993753080586,
153
- "grad_norm": 0.44129157066345215,
154
- "learning_rate": 9.194043598403439e-05,
155
- "loss": 1.0663,
156
- "step": 1050
157
- },
158
- {
159
- "epoch": 0.08442755360370138,
160
- "grad_norm": 0.6179072856903076,
161
- "learning_rate": 9.155664722136937e-05,
162
- "loss": 1.0639,
163
- "step": 1100
164
- },
165
- {
166
- "epoch": 0.0882651696765969,
167
- "grad_norm": 0.847963273525238,
168
- "learning_rate": 9.117285845870434e-05,
169
- "loss": 1.0601,
170
- "step": 1150
171
- },
172
- {
173
- "epoch": 0.09210278574949242,
174
- "grad_norm": 0.8802704215049744,
175
- "learning_rate": 9.07890696960393e-05,
176
- "loss": 1.0536,
177
- "step": 1200
178
- },
179
- {
180
- "epoch": 0.09594040182238793,
181
- "grad_norm": 0.837968111038208,
182
- "learning_rate": 9.040528093337427e-05,
183
- "loss": 1.0525,
184
- "step": 1250
185
- },
186
- {
187
- "epoch": 0.09977801789528346,
188
- "grad_norm": 0.8432613611221313,
189
- "learning_rate": 9.002149217070924e-05,
190
- "loss": 1.0482,
191
- "step": 1300
192
- },
193
- {
194
- "epoch": 0.10361563396817897,
195
- "grad_norm": 0.5921217799186707,
196
- "learning_rate": 8.963770340804421e-05,
197
- "loss": 1.0449,
198
- "step": 1350
199
- },
200
- {
201
- "epoch": 0.10745325004107449,
202
- "grad_norm": 0.45630180835723877,
203
- "learning_rate": 8.925391464537919e-05,
204
- "loss": 1.0416,
205
- "step": 1400
206
- },
207
- {
208
- "epoch": 0.11129086611397,
209
- "grad_norm": 0.45162612199783325,
210
- "learning_rate": 8.887012588271416e-05,
211
- "loss": 1.0384,
212
- "step": 1450
213
- },
214
- {
215
- "epoch": 0.11512848218686551,
216
- "grad_norm": 0.9114300012588501,
217
- "learning_rate": 8.848633712004914e-05,
218
- "loss": 1.0355,
219
- "step": 1500
220
- },
221
- {
222
- "epoch": 0.11896609825976104,
223
- "grad_norm": 1.0031269788742065,
224
- "learning_rate": 8.81025483573841e-05,
225
- "loss": 1.0336,
226
- "step": 1550
227
- },
228
- {
229
- "epoch": 0.12280371433265655,
230
- "grad_norm": 0.6353164911270142,
231
- "learning_rate": 8.771875959471906e-05,
232
- "loss": 1.0284,
233
- "step": 1600
234
- },
235
- {
236
- "epoch": 0.12664133040555206,
237
- "grad_norm": 0.5234444737434387,
238
- "learning_rate": 8.733497083205404e-05,
239
- "loss": 1.0277,
240
- "step": 1650
241
- },
242
- {
243
- "epoch": 0.1304789464784476,
244
- "grad_norm": 1.0591809749603271,
245
- "learning_rate": 8.695118206938901e-05,
246
- "loss": 1.0262,
247
- "step": 1700
248
- },
249
- {
250
- "epoch": 0.1343165625513431,
251
- "grad_norm": 0.3141140341758728,
252
- "learning_rate": 8.656739330672398e-05,
253
- "loss": 1.0253,
254
- "step": 1750
255
- },
256
- {
257
- "epoch": 0.13815417862423862,
258
- "grad_norm": 0.6443182229995728,
259
- "learning_rate": 8.618360454405895e-05,
260
- "loss": 1.0202,
261
- "step": 1800
262
- },
263
- {
264
- "epoch": 0.14199179469713413,
265
- "grad_norm": 0.7155106067657471,
266
- "learning_rate": 8.579981578139392e-05,
267
- "loss": 1.0202,
268
- "step": 1850
269
- },
270
- {
271
- "epoch": 0.14582941077002964,
272
- "grad_norm": 0.7634809017181396,
273
- "learning_rate": 8.54160270187289e-05,
274
- "loss": 1.0165,
275
- "step": 1900
276
- },
277
- {
278
- "epoch": 0.14966702684292518,
279
- "grad_norm": 0.4034004509449005,
280
- "learning_rate": 8.503223825606387e-05,
281
- "loss": 1.0157,
282
- "step": 1950
283
- },
284
- {
285
- "epoch": 0.1535046429158207,
286
- "grad_norm": 1.0218342542648315,
287
- "learning_rate": 8.464844949339883e-05,
288
- "loss": 1.0122,
289
- "step": 2000
290
- },
291
- {
292
- "epoch": 0.1573422589887162,
293
- "grad_norm": 0.9840885996818542,
294
- "learning_rate": 8.426466073073382e-05,
295
- "loss": 1.0112,
296
- "step": 2050
297
- },
298
- {
299
- "epoch": 0.16117987506161172,
300
- "grad_norm": 0.6855255961418152,
301
- "learning_rate": 8.388087196806877e-05,
302
- "loss": 1.007,
303
- "step": 2100
304
- },
305
- {
306
- "epoch": 0.16501749113450725,
307
- "grad_norm": 1.3098440170288086,
308
- "learning_rate": 8.349708320540375e-05,
309
- "loss": 1.0063,
310
- "step": 2150
311
- },
312
- {
313
- "epoch": 0.16885510720740277,
314
- "grad_norm": 0.7584977149963379,
315
- "learning_rate": 8.311329444273872e-05,
316
- "loss": 1.0068,
317
- "step": 2200
318
- },
319
- {
320
- "epoch": 0.17269272328029828,
321
- "grad_norm": 0.5157095193862915,
322
- "learning_rate": 8.272950568007369e-05,
323
- "loss": 1.0043,
324
- "step": 2250
325
- },
326
- {
327
- "epoch": 0.1765303393531938,
328
- "grad_norm": 0.78426194190979,
329
- "learning_rate": 8.234571691740867e-05,
330
- "loss": 0.9992,
331
- "step": 2300
332
- },
333
- {
334
- "epoch": 0.18036795542608933,
335
- "grad_norm": 0.729180097579956,
336
- "learning_rate": 8.196192815474362e-05,
337
- "loss": 0.9974,
338
- "step": 2350
339
- },
340
- {
341
- "epoch": 0.18420557149898484,
342
- "grad_norm": 0.8251708149909973,
343
- "learning_rate": 8.15781393920786e-05,
344
- "loss": 0.9944,
345
- "step": 2400
346
- },
347
- {
348
- "epoch": 0.18804318757188035,
349
- "grad_norm": 0.4832938015460968,
350
- "learning_rate": 8.119435062941357e-05,
351
- "loss": 0.9959,
352
- "step": 2450
353
- },
354
- {
355
- "epoch": 0.19188080364477586,
356
- "grad_norm": 1.06001877784729,
357
- "learning_rate": 8.081056186674854e-05,
358
- "loss": 0.9944,
359
- "step": 2500
360
- },
361
- {
362
- "epoch": 0.19571841971767137,
363
- "grad_norm": 0.8641451597213745,
364
- "learning_rate": 8.042677310408352e-05,
365
- "loss": 0.989,
366
- "step": 2550
367
- },
368
- {
369
- "epoch": 0.1995560357905669,
370
- "grad_norm": 0.3354558050632477,
371
- "learning_rate": 8.004298434141849e-05,
372
- "loss": 0.9891,
373
- "step": 2600
374
- },
375
- {
376
- "epoch": 0.20339365186346242,
377
- "grad_norm": 0.26871201395988464,
378
- "learning_rate": 7.965919557875346e-05,
379
- "loss": 0.9845,
380
- "step": 2650
381
- },
382
- {
383
- "epoch": 0.20723126793635793,
384
- "grad_norm": 0.5861389636993408,
385
- "learning_rate": 7.927540681608843e-05,
386
- "loss": 0.982,
387
- "step": 2700
388
- },
389
- {
390
- "epoch": 0.21106888400925344,
391
- "grad_norm": 0.3075869679450989,
392
- "learning_rate": 7.88916180534234e-05,
393
- "loss": 0.9814,
394
- "step": 2750
395
- },
396
- {
397
- "epoch": 0.21490650008214898,
398
- "grad_norm": 0.3018185496330261,
399
- "learning_rate": 7.850782929075838e-05,
400
- "loss": 0.9814,
401
- "step": 2800
402
- },
403
- {
404
- "epoch": 0.2187441161550445,
405
- "grad_norm": 0.7530673742294312,
406
- "learning_rate": 7.812404052809335e-05,
407
- "loss": 0.9807,
408
- "step": 2850
409
- },
410
- {
411
- "epoch": 0.22258173222794,
412
- "grad_norm": 0.8647517561912537,
413
- "learning_rate": 7.774025176542831e-05,
414
- "loss": 0.9749,
415
- "step": 2900
416
- },
417
- {
418
- "epoch": 0.22641934830083552,
419
- "grad_norm": 0.3062540888786316,
420
- "learning_rate": 7.735646300276328e-05,
421
- "loss": 0.9761,
422
- "step": 2950
423
- },
424
- {
425
- "epoch": 0.23025696437373103,
426
- "grad_norm": 0.8063308596611023,
427
- "learning_rate": 7.697267424009825e-05,
428
- "loss": 0.9705,
429
- "step": 3000
430
- },
431
- {
432
- "epoch": 0.23409458044662657,
433
- "grad_norm": 1.2951929569244385,
434
- "learning_rate": 7.658888547743323e-05,
435
- "loss": 0.9741,
436
- "step": 3050
437
- },
438
- {
439
- "epoch": 0.23793219651952208,
440
- "grad_norm": 0.57526695728302,
441
- "learning_rate": 7.62050967147682e-05,
442
- "loss": 0.9667,
443
- "step": 3100
444
- },
445
- {
446
- "epoch": 0.2417698125924176,
447
- "grad_norm": 0.21630573272705078,
448
- "learning_rate": 7.582130795210317e-05,
449
- "loss": 0.9658,
450
- "step": 3150
451
- },
452
- {
453
- "epoch": 0.2456074286653131,
454
- "grad_norm": 0.44468560814857483,
455
- "learning_rate": 7.543751918943814e-05,
456
- "loss": 0.9652,
457
- "step": 3200
458
- },
459
- {
460
- "epoch": 0.24944504473820864,
461
- "grad_norm": 0.484546959400177,
462
- "learning_rate": 7.50537304267731e-05,
463
- "loss": 0.9632,
464
- "step": 3250
465
- },
466
- {
467
- "epoch": 0.2532826608111041,
468
- "grad_norm": 0.6433889865875244,
469
- "learning_rate": 7.466994166410808e-05,
470
- "loss": 0.9623,
471
- "step": 3300
472
- },
473
- {
474
- "epoch": 0.25712027688399963,
475
- "grad_norm": 0.43382543325424194,
476
- "learning_rate": 7.428615290144305e-05,
477
- "loss": 0.9607,
478
- "step": 3350
479
- },
480
- {
481
- "epoch": 0.2609578929568952,
482
- "grad_norm": 0.524282693862915,
483
- "learning_rate": 7.390236413877802e-05,
484
- "loss": 0.9604,
485
- "step": 3400
486
- },
487
- {
488
- "epoch": 0.2647955090297907,
489
- "grad_norm": 0.8118641972541809,
490
- "learning_rate": 7.351857537611299e-05,
491
- "loss": 0.9571,
492
- "step": 3450
493
- },
494
- {
495
- "epoch": 0.2686331251026862,
496
- "grad_norm": 0.7356590628623962,
497
- "learning_rate": 7.313478661344796e-05,
498
- "loss": 0.9535,
499
- "step": 3500
500
- },
501
- {
502
- "epoch": 0.27247074117558173,
503
- "grad_norm": 0.27771323919296265,
504
- "learning_rate": 7.275099785078293e-05,
505
- "loss": 0.9557,
506
- "step": 3550
507
- },
508
- {
509
- "epoch": 0.27630835724847724,
510
- "grad_norm": 0.5177915692329407,
511
- "learning_rate": 7.23672090881179e-05,
512
- "loss": 0.9542,
513
- "step": 3600
514
- },
515
- {
516
- "epoch": 0.28014597332137275,
517
- "grad_norm": 0.3106517195701599,
518
- "learning_rate": 7.198342032545287e-05,
519
- "loss": 0.9538,
520
- "step": 3650
521
- },
522
- {
523
- "epoch": 0.28398358939426827,
524
- "grad_norm": 0.4957466125488281,
525
- "learning_rate": 7.159963156278786e-05,
526
- "loss": 0.9513,
527
- "step": 3700
528
- },
529
- {
530
- "epoch": 0.2878212054671638,
531
- "grad_norm": 0.5112007856369019,
532
- "learning_rate": 7.121584280012281e-05,
533
- "loss": 0.949,
534
- "step": 3750
535
- },
536
- {
537
- "epoch": 0.2916588215400593,
538
- "grad_norm": 0.6187167167663574,
539
- "learning_rate": 7.083205403745778e-05,
540
- "loss": 0.9477,
541
- "step": 3800
542
- },
543
- {
544
- "epoch": 0.29549643761295485,
545
- "grad_norm": 0.29803329706192017,
546
- "learning_rate": 7.044826527479276e-05,
547
- "loss": 0.9497,
548
- "step": 3850
549
- },
550
- {
551
- "epoch": 0.29933405368585037,
552
- "grad_norm": 0.7799173593521118,
553
- "learning_rate": 7.006447651212773e-05,
554
- "loss": 0.9432,
555
- "step": 3900
556
- },
557
- {
558
- "epoch": 0.3031716697587459,
559
- "grad_norm": 0.37847256660461426,
560
- "learning_rate": 6.96806877494627e-05,
561
- "loss": 0.9396,
562
- "step": 3950
563
- },
564
- {
565
- "epoch": 0.3070092858316414,
566
- "grad_norm": 0.4524092972278595,
567
- "learning_rate": 6.929689898679766e-05,
568
- "loss": 0.943,
569
- "step": 4000
570
- },
571
- {
572
- "epoch": 0.3108469019045369,
573
- "grad_norm": 0.6535223126411438,
574
- "learning_rate": 6.891311022413263e-05,
575
- "loss": 0.944,
576
- "step": 4050
577
- },
578
- {
579
- "epoch": 0.3146845179774324,
580
- "grad_norm": 0.5323360562324524,
581
- "learning_rate": 6.852932146146761e-05,
582
- "loss": 0.942,
583
- "step": 4100
584
- },
585
- {
586
- "epoch": 0.3185221340503279,
587
- "grad_norm": 0.784599781036377,
588
- "learning_rate": 6.814553269880258e-05,
589
- "loss": 0.9386,
590
- "step": 4150
591
- },
592
- {
593
- "epoch": 0.32235975012322343,
594
- "grad_norm": 0.4353456497192383,
595
- "learning_rate": 6.776174393613755e-05,
596
- "loss": 0.9406,
597
- "step": 4200
598
- },
599
- {
600
- "epoch": 0.326197366196119,
601
- "grad_norm": 0.5127778053283691,
602
- "learning_rate": 6.737795517347253e-05,
603
- "loss": 0.9374,
604
- "step": 4250
605
- },
606
- {
607
- "epoch": 0.3300349822690145,
608
- "grad_norm": 0.8174408674240112,
609
- "learning_rate": 6.699416641080749e-05,
610
- "loss": 0.932,
611
- "step": 4300
612
- },
613
- {
614
- "epoch": 0.33387259834191,
615
- "grad_norm": 0.2989351153373718,
616
- "learning_rate": 6.661037764814247e-05,
617
- "loss": 0.9345,
618
- "step": 4350
619
- },
620
- {
621
- "epoch": 0.33771021441480553,
622
- "grad_norm": 0.41601112484931946,
623
- "learning_rate": 6.622658888547744e-05,
624
- "loss": 0.9347,
625
- "step": 4400
626
- },
627
- {
628
- "epoch": 0.34154783048770104,
629
- "grad_norm": 0.4532497525215149,
630
- "learning_rate": 6.58428001228124e-05,
631
- "loss": 0.932,
632
- "step": 4450
633
- },
634
- {
635
- "epoch": 0.34538544656059655,
636
- "grad_norm": 0.4111255407333374,
637
- "learning_rate": 6.545901136014739e-05,
638
- "loss": 0.936,
639
- "step": 4500
640
- },
641
- {
642
- "epoch": 0.34922306263349207,
643
- "grad_norm": 0.7033655047416687,
644
- "learning_rate": 6.507522259748234e-05,
645
- "loss": 0.9307,
646
- "step": 4550
647
- },
648
- {
649
- "epoch": 0.3530606787063876,
650
- "grad_norm": 0.4548279643058777,
651
- "learning_rate": 6.469143383481732e-05,
652
- "loss": 0.9278,
653
- "step": 4600
654
- },
655
- {
656
- "epoch": 0.3568982947792831,
657
- "grad_norm": 0.5447307229042053,
658
- "learning_rate": 6.430764507215229e-05,
659
- "loss": 0.9289,
660
- "step": 4650
661
- },
662
- {
663
- "epoch": 0.36073591085217865,
664
- "grad_norm": 0.38505110144615173,
665
- "learning_rate": 6.392385630948726e-05,
666
- "loss": 0.9237,
667
- "step": 4700
668
- },
669
- {
670
- "epoch": 0.36457352692507417,
671
- "grad_norm": 0.40116333961486816,
672
- "learning_rate": 6.354006754682224e-05,
673
- "loss": 0.9248,
674
- "step": 4750
675
- },
676
- {
677
- "epoch": 0.3684111429979697,
678
- "grad_norm": 0.3748728930950165,
679
- "learning_rate": 6.315627878415721e-05,
680
- "loss": 0.9203,
681
- "step": 4800
682
- },
683
- {
684
- "epoch": 0.3722487590708652,
685
- "grad_norm": 0.5459182858467102,
686
- "learning_rate": 6.277249002149218e-05,
687
- "loss": 0.9263,
688
- "step": 4850
689
- },
690
- {
691
- "epoch": 0.3760863751437607,
692
- "grad_norm": 0.5564711093902588,
693
- "learning_rate": 6.238870125882714e-05,
694
- "loss": 0.9242,
695
- "step": 4900
696
- },
697
- {
698
- "epoch": 0.3799239912166562,
699
- "grad_norm": 0.5415127873420715,
700
- "learning_rate": 6.200491249616211e-05,
701
- "loss": 0.9165,
702
- "step": 4950
703
- },
704
- {
705
- "epoch": 0.3837616072895517,
706
- "grad_norm": 0.9149804711341858,
707
- "learning_rate": 6.162112373349709e-05,
708
- "loss": 0.92,
709
- "step": 5000
710
- },
711
- {
712
- "epoch": 0.38759922336244723,
713
- "grad_norm": 0.6962669491767883,
714
- "learning_rate": 6.123733497083206e-05,
715
- "loss": 0.9186,
716
- "step": 5050
717
- },
718
- {
719
- "epoch": 0.39143683943534274,
720
- "grad_norm": 0.6156628131866455,
721
- "learning_rate": 6.085354620816702e-05,
722
- "loss": 0.9139,
723
- "step": 5100
724
- },
725
- {
726
- "epoch": 0.3952744555082383,
727
- "grad_norm": 0.4484277069568634,
728
- "learning_rate": 6.0469757445502e-05,
729
- "loss": 0.914,
730
- "step": 5150
731
- },
732
- {
733
- "epoch": 0.3991120715811338,
734
- "grad_norm": 0.6082286834716797,
735
- "learning_rate": 6.0085968682836965e-05,
736
- "loss": 0.9148,
737
- "step": 5200
738
- },
739
- {
740
- "epoch": 0.40294968765402933,
741
- "grad_norm": 0.6756613850593567,
742
- "learning_rate": 5.970217992017194e-05,
743
- "loss": 0.9137,
744
- "step": 5250
745
- },
746
- {
747
- "epoch": 0.40678730372692484,
748
- "grad_norm": 0.6353741884231567,
749
- "learning_rate": 5.9318391157506915e-05,
750
- "loss": 0.9094,
751
- "step": 5300
752
- },
753
- {
754
- "epoch": 0.41062491979982035,
755
- "grad_norm": 0.6543828845024109,
756
- "learning_rate": 5.893460239484189e-05,
757
- "loss": 0.9089,
758
- "step": 5350
759
- },
760
- {
761
- "epoch": 0.41446253587271586,
762
- "grad_norm": 0.6633620262145996,
763
- "learning_rate": 5.855081363217685e-05,
764
- "loss": 0.9105,
765
- "step": 5400
766
- },
767
- {
768
- "epoch": 0.4183001519456114,
769
- "grad_norm": 0.6769128441810608,
770
- "learning_rate": 5.816702486951182e-05,
771
- "loss": 0.9095,
772
- "step": 5450
773
- },
774
- {
775
- "epoch": 0.4221377680185069,
776
- "grad_norm": 0.6803929209709167,
777
- "learning_rate": 5.7783236106846794e-05,
778
- "loss": 0.9085,
779
- "step": 5500
780
- },
781
- {
782
- "epoch": 0.4259753840914024,
783
- "grad_norm": 0.4861834645271301,
784
- "learning_rate": 5.739944734418177e-05,
785
- "loss": 0.9067,
786
- "step": 5550
787
- },
788
- {
789
- "epoch": 0.42981300016429796,
790
- "grad_norm": 0.24226143956184387,
791
- "learning_rate": 5.7015658581516737e-05,
792
- "loss": 0.9066,
793
- "step": 5600
794
- },
795
- {
796
- "epoch": 0.4336506162371935,
797
- "grad_norm": 0.2108086198568344,
798
- "learning_rate": 5.6631869818851705e-05,
799
- "loss": 0.9061,
800
- "step": 5650
801
- },
802
- {
803
- "epoch": 0.437488232310089,
804
- "grad_norm": 0.7616965770721436,
805
- "learning_rate": 5.624808105618667e-05,
806
- "loss": 0.9041,
807
- "step": 5700
808
- },
809
- {
810
- "epoch": 0.4413258483829845,
811
- "grad_norm": 0.3760414719581604,
812
- "learning_rate": 5.586429229352165e-05,
813
- "loss": 0.9035,
814
- "step": 5750
815
- },
816
- {
817
- "epoch": 0.44516346445588,
818
- "grad_norm": 0.4564415216445923,
819
- "learning_rate": 5.548050353085662e-05,
820
- "loss": 0.902,
821
- "step": 5800
822
- },
823
- {
824
- "epoch": 0.4490010805287755,
825
- "grad_norm": 0.803648054599762,
826
- "learning_rate": 5.509671476819159e-05,
827
- "loss": 0.9011,
828
- "step": 5850
829
- },
830
- {
831
- "epoch": 0.45283869660167103,
832
- "grad_norm": 0.7869254350662231,
833
- "learning_rate": 5.4712926005526565e-05,
834
- "loss": 0.9007,
835
- "step": 5900
836
- },
837
- {
838
- "epoch": 0.45667631267456654,
839
- "grad_norm": 0.8484482765197754,
840
- "learning_rate": 5.4329137242861526e-05,
841
- "loss": 0.902,
842
- "step": 5950
843
- },
844
- {
845
- "epoch": 0.46051392874746205,
846
- "grad_norm": 0.4946975111961365,
847
- "learning_rate": 5.39453484801965e-05,
848
- "loss": 0.8968,
849
- "step": 6000
850
  }
851
  ],
852
- "logging_steps": 50,
853
- "max_steps": 13028,
854
  "num_input_tokens_seen": 0,
855
  "num_train_epochs": 1,
856
- "save_steps": 500,
857
  "stateful_callbacks": {
858
  "TrainerControl": {
859
  "args": {
@@ -866,8 +40,8 @@
866
  "attributes": {}
867
  }
868
  },
869
- "total_flos": 1.0284708509245243e+19,
870
- "train_batch_size": 2,
871
  "trial_name": null,
872
  "trial_params": null
873
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0012275351477837237,
5
+ "eval_steps": 2,
6
+ "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0006137675738918619,
13
+ "grad_norm": 158.1446075439453,
14
+ "learning_rate": 9.993861264579497e-05,
15
+ "loss": 100.2575,
16
+ "step": 1
17
  },
18
  {
19
+ "epoch": 0.0012275351477837237,
20
+ "grad_norm": 105.63041687011719,
21
+ "learning_rate": 9.987722529158994e-05,
22
+ "loss": 95.2722,
23
+ "step": 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
  ],
26
+ "logging_steps": 1,
27
+ "max_steps": 1629,
28
  "num_input_tokens_seen": 0,
29
  "num_train_epochs": 1,
30
+ "save_steps": 2,
31
  "stateful_callbacks": {
32
  "TrainerControl": {
33
  "args": {
 
40
  "attributes": {}
41
  }
42
  },
43
+ "total_flos": 919791151165440.0,
44
+ "train_batch_size": 8,
45
  "trial_name": null,
46
  "trial_params": null
47
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65de12ccf65227e16c5ac7d6f4de8c23b93867370e90dd502a95ed85503923fb
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:064b240ea07b11fb2a55256aa70c4f515e16a1e7de5972e80b77b98e19219a68
3
  size 5240