hmankar01 committed on
Commit
3fe69ef
·
verified ·
1 Parent(s): 875ab18

Upload 8 files

Browse files
Files changed (5) hide show
  1. optimizer.pt +3 -0
  2. rng_state.pth +3 -0
  3. scheduler.pt +3 -0
  4. trainer_state.json +465 -0
  5. training_args.bin +3 -0
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fc06cbe2a43b0a0c035ed87bf071a180679b4135f5645b2d266c2599065ae1c
3
+ size 33662074
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77d2e2048751f7b82b7d03a4d56de219162dbe5a0a285f2a2e022884f503a580
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c0ba6dee32a29c1b38007c255586352f19a3c1a7bfb24452b6b1f3673e8e047
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9955555555555555,
5
+ "eval_steps": 100,
6
+ "global_step": 562,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.035555555555555556,
13
+ "grad_norm": 11.767955780029297,
14
+ "learning_rate": 0.00019679715302491104,
15
+ "loss": 33.6554,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.07111111111111111,
20
+ "grad_norm": 9.438668251037598,
21
+ "learning_rate": 0.0001932384341637011,
22
+ "loss": 33.8787,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.10666666666666667,
27
+ "grad_norm": 10.174617767333984,
28
+ "learning_rate": 0.00018967971530249112,
29
+ "loss": 33.8919,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.14222222222222222,
34
+ "grad_norm": 8.84274673461914,
35
+ "learning_rate": 0.00018612099644128114,
36
+ "loss": 33.7011,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.17777777777777778,
41
+ "grad_norm": 10.169342041015625,
42
+ "learning_rate": 0.0001825622775800712,
43
+ "loss": 33.6306,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.21333333333333335,
48
+ "grad_norm": 9.339362144470215,
49
+ "learning_rate": 0.0001790035587188612,
50
+ "loss": 33.5378,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.24888888888888888,
55
+ "grad_norm": 10.399051666259766,
56
+ "learning_rate": 0.00017544483985765125,
57
+ "loss": 33.1223,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.28444444444444444,
62
+ "grad_norm": 8.772202491760254,
63
+ "learning_rate": 0.00017188612099644127,
64
+ "loss": 34.3864,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.32,
69
+ "grad_norm": 9.338233947753906,
70
+ "learning_rate": 0.00016832740213523133,
71
+ "loss": 33.2955,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.35555555555555557,
76
+ "grad_norm": 9.439739227294922,
77
+ "learning_rate": 0.00016476868327402135,
78
+ "loss": 33.229,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.35555555555555557,
83
+ "eval_loss": 2.133469820022583,
84
+ "eval_runtime": 296.1668,
85
+ "eval_samples_per_second": 3.376,
86
+ "eval_steps_per_second": 0.422,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.39111111111111113,
91
+ "grad_norm": 9.046673774719238,
92
+ "learning_rate": 0.0001612099644128114,
93
+ "loss": 33.3667,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.4266666666666667,
98
+ "grad_norm": 8.99227237701416,
99
+ "learning_rate": 0.00015765124555160143,
100
+ "loss": 32.6701,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.4622222222222222,
105
+ "grad_norm": 7.6904144287109375,
106
+ "learning_rate": 0.00015409252669039148,
107
+ "loss": 33.2927,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.49777777777777776,
112
+ "grad_norm": 8.012206077575684,
113
+ "learning_rate": 0.00015053380782918148,
114
+ "loss": 33.2934,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.5333333333333333,
119
+ "grad_norm": 10.931622505187988,
120
+ "learning_rate": 0.00014697508896797153,
121
+ "loss": 33.3676,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.5688888888888889,
126
+ "grad_norm": 7.606035232543945,
127
+ "learning_rate": 0.00014341637010676156,
128
+ "loss": 34.1758,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.6044444444444445,
133
+ "grad_norm": 9.531214714050293,
134
+ "learning_rate": 0.0001398576512455516,
135
+ "loss": 33.0847,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 0.64,
140
+ "grad_norm": 8.761300086975098,
141
+ "learning_rate": 0.00013629893238434164,
142
+ "loss": 33.5206,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 0.6755555555555556,
147
+ "grad_norm": 9.155729293823242,
148
+ "learning_rate": 0.0001327402135231317,
149
+ "loss": 33.2403,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 0.7111111111111111,
154
+ "grad_norm": 9.354476928710938,
155
+ "learning_rate": 0.00012918149466192172,
156
+ "loss": 33.5548,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 0.7111111111111111,
161
+ "eval_loss": 2.126850128173828,
162
+ "eval_runtime": 296.1679,
163
+ "eval_samples_per_second": 3.376,
164
+ "eval_steps_per_second": 0.422,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 0.7466666666666667,
169
+ "grad_norm": 8.922224998474121,
170
+ "learning_rate": 0.00012562277580071177,
171
+ "loss": 33.279,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 0.7822222222222223,
176
+ "grad_norm": 9.973633766174316,
177
+ "learning_rate": 0.00012206405693950178,
178
+ "loss": 33.5481,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 0.8177777777777778,
183
+ "grad_norm": 8.771803855895996,
184
+ "learning_rate": 0.00011850533807829183,
185
+ "loss": 33.1058,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 0.8533333333333334,
190
+ "grad_norm": 10.16543960571289,
191
+ "learning_rate": 0.00011494661921708185,
192
+ "loss": 33.3706,
193
+ "step": 240
194
+ },
195
+ {
196
+ "epoch": 0.8888888888888888,
197
+ "grad_norm": 9.286821365356445,
198
+ "learning_rate": 0.0001113879003558719,
199
+ "loss": 33.3456,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 0.9244444444444444,
204
+ "grad_norm": 9.520956039428711,
205
+ "learning_rate": 0.00010782918149466192,
206
+ "loss": 33.5781,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 0.96,
211
+ "grad_norm": 10.376456260681152,
212
+ "learning_rate": 0.00010427046263345198,
213
+ "loss": 32.9687,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 0.9955555555555555,
218
+ "grad_norm": 8.36178207397461,
219
+ "learning_rate": 0.00010071174377224199,
220
+ "loss": 33.7239,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 1.0284444444444445,
225
+ "grad_norm": 10.113052368164062,
226
+ "learning_rate": 9.715302491103203e-05,
227
+ "loss": 29.9997,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 1.064,
232
+ "grad_norm": 11.123631477355957,
233
+ "learning_rate": 9.359430604982207e-05,
234
+ "loss": 32.5004,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 1.064,
239
+ "eval_loss": 2.122236490249634,
240
+ "eval_runtime": 296.127,
241
+ "eval_samples_per_second": 3.377,
242
+ "eval_steps_per_second": 0.422,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 1.0995555555555556,
247
+ "grad_norm": 9.897551536560059,
248
+ "learning_rate": 9.00355871886121e-05,
249
+ "loss": 32.5046,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 1.1351111111111112,
254
+ "grad_norm": 9.53073501586914,
255
+ "learning_rate": 8.647686832740213e-05,
256
+ "loss": 32.2727,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 1.1706666666666667,
261
+ "grad_norm": 10.394311904907227,
262
+ "learning_rate": 8.291814946619217e-05,
263
+ "loss": 32.688,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 1.2062222222222223,
268
+ "grad_norm": 9.498970031738281,
269
+ "learning_rate": 7.935943060498221e-05,
270
+ "loss": 33.6316,
271
+ "step": 340
272
+ },
273
+ {
274
+ "epoch": 1.2417777777777779,
275
+ "grad_norm": 10.150975227355957,
276
+ "learning_rate": 7.580071174377225e-05,
277
+ "loss": 33.0713,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 1.2773333333333334,
282
+ "grad_norm": 9.899177551269531,
283
+ "learning_rate": 7.224199288256229e-05,
284
+ "loss": 32.4769,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 1.3128888888888888,
289
+ "grad_norm": 9.39831829071045,
290
+ "learning_rate": 6.868327402135231e-05,
291
+ "loss": 32.2654,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 1.3484444444444446,
296
+ "grad_norm": 10.761151313781738,
297
+ "learning_rate": 6.512455516014235e-05,
298
+ "loss": 32.491,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 1.384,
303
+ "grad_norm": 9.932414054870605,
304
+ "learning_rate": 6.156583629893239e-05,
305
+ "loss": 33.5308,
306
+ "step": 390
307
+ },
308
+ {
309
+ "epoch": 1.4195555555555557,
310
+ "grad_norm": 11.054327011108398,
311
+ "learning_rate": 5.8007117437722425e-05,
312
+ "loss": 31.7061,
313
+ "step": 400
314
+ },
315
+ {
316
+ "epoch": 1.4195555555555557,
317
+ "eval_loss": 2.120673418045044,
318
+ "eval_runtime": 296.1092,
319
+ "eval_samples_per_second": 3.377,
320
+ "eval_steps_per_second": 0.422,
321
+ "step": 400
322
+ },
323
+ {
324
+ "epoch": 1.455111111111111,
325
+ "grad_norm": 10.89476203918457,
326
+ "learning_rate": 5.4448398576512464e-05,
327
+ "loss": 32.485,
328
+ "step": 410
329
+ },
330
+ {
331
+ "epoch": 1.4906666666666666,
332
+ "grad_norm": 9.823376655578613,
333
+ "learning_rate": 5.0889679715302496e-05,
334
+ "loss": 32.9951,
335
+ "step": 420
336
+ },
337
+ {
338
+ "epoch": 1.5262222222222221,
339
+ "grad_norm": 11.316079139709473,
340
+ "learning_rate": 4.733096085409253e-05,
341
+ "loss": 32.3443,
342
+ "step": 430
343
+ },
344
+ {
345
+ "epoch": 1.561777777777778,
346
+ "grad_norm": 11.608524322509766,
347
+ "learning_rate": 4.377224199288256e-05,
348
+ "loss": 32.2948,
349
+ "step": 440
350
+ },
351
+ {
352
+ "epoch": 1.5973333333333333,
353
+ "grad_norm": 11.020298957824707,
354
+ "learning_rate": 4.02135231316726e-05,
355
+ "loss": 32.6702,
356
+ "step": 450
357
+ },
358
+ {
359
+ "epoch": 1.6328888888888888,
360
+ "grad_norm": 9.804555892944336,
361
+ "learning_rate": 3.665480427046263e-05,
362
+ "loss": 31.6452,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 1.6684444444444444,
367
+ "grad_norm": 11.037073135375977,
368
+ "learning_rate": 3.309608540925267e-05,
369
+ "loss": 32.479,
370
+ "step": 470
371
+ },
372
+ {
373
+ "epoch": 1.704,
374
+ "grad_norm": 9.837021827697754,
375
+ "learning_rate": 2.9537366548042704e-05,
376
+ "loss": 32.72,
377
+ "step": 480
378
+ },
379
+ {
380
+ "epoch": 1.7395555555555555,
381
+ "grad_norm": 11.720721244812012,
382
+ "learning_rate": 2.597864768683274e-05,
383
+ "loss": 32.6789,
384
+ "step": 490
385
+ },
386
+ {
387
+ "epoch": 1.775111111111111,
388
+ "grad_norm": 11.738125801086426,
389
+ "learning_rate": 2.2419928825622775e-05,
390
+ "loss": 33.3128,
391
+ "step": 500
392
+ },
393
+ {
394
+ "epoch": 1.775111111111111,
395
+ "eval_loss": 2.11881947517395,
396
+ "eval_runtime": 296.1216,
397
+ "eval_samples_per_second": 3.377,
398
+ "eval_steps_per_second": 0.422,
399
+ "step": 500
400
+ },
401
+ {
402
+ "epoch": 1.8106666666666666,
403
+ "grad_norm": 11.249613761901855,
404
+ "learning_rate": 1.8861209964412814e-05,
405
+ "loss": 31.9298,
406
+ "step": 510
407
+ },
408
+ {
409
+ "epoch": 1.8462222222222222,
410
+ "grad_norm": 11.530637741088867,
411
+ "learning_rate": 1.530249110320285e-05,
412
+ "loss": 31.8878,
413
+ "step": 520
414
+ },
415
+ {
416
+ "epoch": 1.8817777777777778,
417
+ "grad_norm": 11.147592544555664,
418
+ "learning_rate": 1.1743772241992882e-05,
419
+ "loss": 32.6852,
420
+ "step": 530
421
+ },
422
+ {
423
+ "epoch": 1.9173333333333333,
424
+ "grad_norm": 9.81916332244873,
425
+ "learning_rate": 8.185053380782918e-06,
426
+ "loss": 32.1578,
427
+ "step": 540
428
+ },
429
+ {
430
+ "epoch": 1.952888888888889,
431
+ "grad_norm": 10.557317733764648,
432
+ "learning_rate": 4.626334519572954e-06,
433
+ "loss": 32.2151,
434
+ "step": 550
435
+ },
436
+ {
437
+ "epoch": 1.9884444444444445,
438
+ "grad_norm": 10.493524551391602,
439
+ "learning_rate": 1.0676156583629894e-06,
440
+ "loss": 31.9549,
441
+ "step": 560
442
+ }
443
+ ],
444
+ "logging_steps": 10,
445
+ "max_steps": 562,
446
+ "num_input_tokens_seen": 0,
447
+ "num_train_epochs": 2,
448
+ "save_steps": 100,
449
+ "stateful_callbacks": {
450
+ "TrainerControl": {
451
+ "args": {
452
+ "should_epoch_stop": false,
453
+ "should_evaluate": false,
454
+ "should_log": false,
455
+ "should_save": true,
456
+ "should_training_stop": true
457
+ },
458
+ "attributes": {}
459
+ }
460
+ },
461
+ "total_flos": 5.93400073703424e+16,
462
+ "train_batch_size": 2,
463
+ "trial_name": null,
464
+ "trial_params": null
465
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f8fc3b0faf567e506d36321edce6eabba9e66e2254f1e24c9840de83a3bb7cb
3
+ size 5304