robertou2 commited on
Commit
507f518
·
verified ·
1 Parent(s): c63b8af

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -12,21 +12,21 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 64,
16
  "lora_bias": false,
17
  "lora_dropout": 0.0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
- "r": 32,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "gate_up_proj",
27
  "qkv_proj",
28
- "down_proj",
29
- "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 128,
16
  "lora_bias": false,
17
  "lora_dropout": 0.0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
+ "r": 64,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "o_proj",
27
  "qkv_proj",
28
+ "gate_up_proj",
29
+ "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:378f8c38bcf12f87f8847d23731af2991c592bdae2d26ced247af7178fd27265
3
- size 184584072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89daabbba6e8fbb223ea2dadd6cafc0473e7831b6a8a31965fd25a932502f63a
3
+ size 369133600
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fd80ea678ab790a886af1b74bedb36f04b6ae3df559d9ae69f59dc7343c1bcf
3
- size 369315019
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53b455474ae0967c9dc13e4a970734e024e902b0e0593a5ebcb87d06c41d8d84
3
+ size 738413771
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bba62abab919d97e86e665bcc2b30cdef0058dbbf59538563dd656f3b9d42e6
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532826bfd6ab0d9c120628c9ba8dbb5c027e661038baccc2d23e0946927a6e4b
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd5c6c97d40727b5ce95e0b935d6d973c3b68a39460f9423ef7a3bc12f3b4643
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2692bc6d8e85cffcbf5a9395fe1431563b67e6de43846a9570babfa256843214
3
  size 1465
trainer_state.json CHANGED
@@ -1,530 +1,770 @@
1
  {
2
- "best_global_step": 34,
3
- "best_metric": 0.8157733678817749,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-34",
5
- "epoch": 2.2666666666666666,
6
  "eval_steps": 1,
7
- "global_step": 34,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.06666666666666667,
14
- "grad_norm": 1.6772907972335815,
15
  "learning_rate": 0.0,
16
- "loss": 3.0786,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
  "eval_loss": 3.15524959564209,
22
- "eval_runtime": 3.3465,
23
- "eval_samples_per_second": 8.965,
24
- "eval_steps_per_second": 1.195,
25
  "step": 1
26
  },
27
  {
28
- "epoch": 0.13333333333333333,
29
- "grad_norm": 1.7768125534057617,
30
  "learning_rate": 3.3333333333333335e-05,
31
- "loss": 3.0737,
32
  "step": 2
33
  },
34
  {
35
- "epoch": 0.13333333333333333,
36
- "eval_loss": 3.0767996311187744,
37
- "eval_runtime": 3.3543,
38
- "eval_samples_per_second": 8.944,
39
- "eval_steps_per_second": 1.192,
40
  "step": 2
41
  },
42
  {
43
- "epoch": 0.2,
44
- "grad_norm": 1.5412702560424805,
45
  "learning_rate": 6.666666666666667e-05,
46
- "loss": 2.9943,
47
  "step": 3
48
  },
49
  {
50
- "epoch": 0.2,
51
- "eval_loss": 2.8993334770202637,
52
- "eval_runtime": 3.3523,
53
- "eval_samples_per_second": 8.949,
54
- "eval_steps_per_second": 1.193,
55
  "step": 3
56
  },
57
  {
58
- "epoch": 0.26666666666666666,
59
- "grad_norm": 1.4991670846939087,
60
  "learning_rate": 0.0001,
61
- "loss": 2.8684,
62
  "step": 4
63
  },
64
  {
65
- "epoch": 0.26666666666666666,
66
- "eval_loss": 2.637805461883545,
67
- "eval_runtime": 3.3436,
68
- "eval_samples_per_second": 8.972,
69
- "eval_steps_per_second": 1.196,
70
  "step": 4
71
  },
72
  {
73
- "epoch": 0.3333333333333333,
74
- "grad_norm": 1.240314245223999,
75
  "learning_rate": 0.00013333333333333334,
76
- "loss": 2.5164,
77
  "step": 5
78
  },
79
  {
80
- "epoch": 0.3333333333333333,
81
- "eval_loss": 2.359757423400879,
82
- "eval_runtime": 3.3265,
83
- "eval_samples_per_second": 9.018,
84
- "eval_steps_per_second": 1.202,
85
  "step": 5
86
  },
87
  {
88
- "epoch": 0.4,
89
- "grad_norm": 1.1895383596420288,
90
  "learning_rate": 0.00016666666666666666,
91
- "loss": 2.314,
92
  "step": 6
93
  },
94
  {
95
- "epoch": 0.4,
96
- "eval_loss": 2.1061525344848633,
97
- "eval_runtime": 3.317,
98
- "eval_samples_per_second": 9.044,
99
- "eval_steps_per_second": 1.206,
100
  "step": 6
101
  },
102
  {
103
- "epoch": 0.4666666666666667,
104
- "grad_norm": 1.2350431680679321,
105
  "learning_rate": 0.0002,
106
- "loss": 2.0978,
107
  "step": 7
108
  },
109
  {
110
- "epoch": 0.4666666666666667,
111
- "eval_loss": 1.862547755241394,
112
- "eval_runtime": 3.3088,
113
- "eval_samples_per_second": 9.067,
114
- "eval_steps_per_second": 1.209,
115
  "step": 7
116
  },
117
  {
118
- "epoch": 0.5333333333333333,
119
- "grad_norm": 1.6615718603134155,
120
  "learning_rate": 0.00023333333333333333,
121
- "loss": 1.8403,
122
  "step": 8
123
  },
124
  {
125
- "epoch": 0.5333333333333333,
126
- "eval_loss": 1.627223253250122,
127
- "eval_runtime": 3.3091,
128
- "eval_samples_per_second": 9.066,
129
- "eval_steps_per_second": 1.209,
130
  "step": 8
131
  },
132
  {
133
- "epoch": 0.6,
134
- "grad_norm": 1.5987708568572998,
135
  "learning_rate": 0.0002666666666666667,
136
- "loss": 1.6561,
137
  "step": 9
138
  },
139
  {
140
- "epoch": 0.6,
141
- "eval_loss": 1.463124394416809,
142
- "eval_runtime": 3.3213,
143
- "eval_samples_per_second": 9.033,
144
- "eval_steps_per_second": 1.204,
145
  "step": 9
146
  },
147
  {
148
- "epoch": 0.6666666666666666,
149
- "grad_norm": 1.553259015083313,
150
  "learning_rate": 0.0003,
151
- "loss": 1.532,
152
  "step": 10
153
  },
154
  {
155
- "epoch": 0.6666666666666666,
156
- "eval_loss": 1.3557301759719849,
157
- "eval_runtime": 3.3346,
158
- "eval_samples_per_second": 8.996,
159
- "eval_steps_per_second": 1.2,
160
  "step": 10
161
  },
162
  {
163
- "epoch": 0.7333333333333333,
164
- "grad_norm": 1.5410878658294678,
165
  "learning_rate": 0.0003333333333333333,
166
- "loss": 1.3789,
167
  "step": 11
168
  },
169
  {
170
- "epoch": 0.7333333333333333,
171
- "eval_loss": 1.284977674484253,
172
- "eval_runtime": 3.3397,
173
- "eval_samples_per_second": 8.983,
174
- "eval_steps_per_second": 1.198,
175
  "step": 11
176
  },
177
  {
178
- "epoch": 0.8,
179
- "grad_norm": 1.5387530326843262,
180
  "learning_rate": 0.00036666666666666667,
181
- "loss": 1.3658,
182
  "step": 12
183
  },
184
  {
185
- "epoch": 0.8,
186
- "eval_loss": 1.2496088743209839,
187
- "eval_runtime": 3.3461,
188
- "eval_samples_per_second": 8.966,
189
- "eval_steps_per_second": 1.195,
190
  "step": 12
191
  },
192
  {
193
- "epoch": 0.8666666666666667,
194
- "grad_norm": 1.2438753843307495,
195
  "learning_rate": 0.0004,
196
- "loss": 1.3617,
197
  "step": 13
198
  },
199
  {
200
- "epoch": 0.8666666666666667,
201
- "eval_loss": 1.1902137994766235,
202
- "eval_runtime": 3.3367,
203
- "eval_samples_per_second": 8.991,
204
- "eval_steps_per_second": 1.199,
205
  "step": 13
206
  },
207
  {
208
- "epoch": 0.9333333333333333,
209
- "grad_norm": 0.8875225186347961,
210
  "learning_rate": 0.00043333333333333337,
211
- "loss": 1.1692,
212
  "step": 14
213
  },
214
  {
215
- "epoch": 0.9333333333333333,
216
- "eval_loss": 1.129626989364624,
217
- "eval_runtime": 3.3354,
218
- "eval_samples_per_second": 8.994,
219
- "eval_steps_per_second": 1.199,
220
  "step": 14
221
  },
222
  {
223
- "epoch": 1.0,
224
- "grad_norm": 0.9996999502182007,
225
  "learning_rate": 0.00046666666666666666,
226
- "loss": 1.3193,
227
  "step": 15
228
  },
229
  {
230
- "epoch": 1.0,
231
- "eval_loss": 1.0915361642837524,
232
- "eval_runtime": 3.3299,
233
- "eval_samples_per_second": 9.009,
234
- "eval_steps_per_second": 1.201,
235
  "step": 15
236
  },
237
  {
238
- "epoch": 1.0666666666666667,
239
- "grad_norm": 0.8160541653633118,
240
  "learning_rate": 0.0005,
241
- "loss": 1.0422,
242
  "step": 16
243
  },
244
  {
245
- "epoch": 1.0666666666666667,
246
- "eval_loss": 1.0750960111618042,
247
- "eval_runtime": 3.3294,
248
- "eval_samples_per_second": 9.011,
249
- "eval_steps_per_second": 1.201,
250
  "step": 16
251
  },
252
  {
253
- "epoch": 1.1333333333333333,
254
- "grad_norm": 0.8319222927093506,
255
  "learning_rate": 0.0004993910125649561,
256
- "loss": 1.1637,
257
  "step": 17
258
  },
259
  {
260
- "epoch": 1.1333333333333333,
261
- "eval_loss": 1.0480690002441406,
262
- "eval_runtime": 3.3231,
263
- "eval_samples_per_second": 9.028,
264
- "eval_steps_per_second": 1.204,
265
  "step": 17
266
  },
267
  {
268
- "epoch": 1.2,
269
- "grad_norm": 0.7125590443611145,
270
  "learning_rate": 0.0004975670171853926,
271
- "loss": 1.0326,
272
  "step": 18
273
  },
274
  {
275
- "epoch": 1.2,
276
- "eval_loss": 1.0194019079208374,
277
- "eval_runtime": 3.3294,
278
- "eval_samples_per_second": 9.011,
279
- "eval_steps_per_second": 1.201,
280
  "step": 18
281
  },
282
  {
283
- "epoch": 1.2666666666666666,
284
- "grad_norm": 0.8782016038894653,
285
  "learning_rate": 0.0004945369001834514,
286
- "loss": 1.018,
287
  "step": 19
288
  },
289
  {
290
- "epoch": 1.2666666666666666,
291
- "eval_loss": 1.0099557638168335,
292
- "eval_runtime": 3.3268,
293
- "eval_samples_per_second": 9.018,
294
- "eval_steps_per_second": 1.202,
295
  "step": 19
296
  },
297
  {
298
- "epoch": 1.3333333333333333,
299
- "grad_norm": 0.6835053563117981,
300
  "learning_rate": 0.0004903154239845797,
301
- "loss": 1.141,
302
  "step": 20
303
  },
304
  {
305
- "epoch": 1.3333333333333333,
306
- "eval_loss": 1.0006548166275024,
307
- "eval_runtime": 3.3331,
308
- "eval_samples_per_second": 9.001,
309
- "eval_steps_per_second": 1.2,
310
  "step": 20
311
  },
312
  {
313
- "epoch": 1.4,
314
- "grad_norm": 0.8351470232009888,
315
  "learning_rate": 0.0004849231551964771,
316
- "loss": 1.1354,
317
  "step": 21
318
  },
319
  {
320
- "epoch": 1.4,
321
- "eval_loss": 0.9695132374763489,
322
- "eval_runtime": 3.3403,
323
- "eval_samples_per_second": 8.981,
324
- "eval_steps_per_second": 1.197,
325
  "step": 21
326
  },
327
  {
328
- "epoch": 1.4666666666666668,
329
- "grad_norm": 0.5992692708969116,
330
  "learning_rate": 0.0004783863644106502,
331
- "loss": 0.9994,
332
  "step": 22
333
  },
334
  {
335
- "epoch": 1.4666666666666668,
336
- "eval_loss": 0.9532836675643921,
337
- "eval_runtime": 3.34,
338
- "eval_samples_per_second": 8.982,
339
- "eval_steps_per_second": 1.198,
340
  "step": 22
341
  },
342
  {
343
- "epoch": 1.5333333333333332,
344
- "grad_norm": 0.6349149346351624,
345
  "learning_rate": 0.00047073689821473173,
346
- "loss": 1.0141,
347
  "step": 23
348
  },
349
  {
350
- "epoch": 1.5333333333333332,
351
- "eval_loss": 0.9443845152854919,
352
- "eval_runtime": 3.3307,
353
- "eval_samples_per_second": 9.007,
354
- "eval_steps_per_second": 1.201,
355
  "step": 23
356
  },
357
  {
358
- "epoch": 1.6,
359
- "grad_norm": 0.6412695646286011,
360
  "learning_rate": 0.00046201202403910646,
361
- "loss": 0.9325,
362
  "step": 24
363
  },
364
  {
365
- "epoch": 1.6,
366
- "eval_loss": 0.9353991150856018,
367
- "eval_runtime": 3.3263,
368
- "eval_samples_per_second": 9.019,
369
- "eval_steps_per_second": 1.203,
370
  "step": 24
371
  },
372
  {
373
- "epoch": 1.6666666666666665,
374
- "grad_norm": 0.6291660070419312,
375
  "learning_rate": 0.0004522542485937369,
376
- "loss": 0.9628,
377
  "step": 25
378
  },
379
  {
380
- "epoch": 1.6666666666666665,
381
- "eval_loss": 0.9189165830612183,
382
- "eval_runtime": 3.3278,
383
- "eval_samples_per_second": 9.015,
384
- "eval_steps_per_second": 1.202,
385
  "step": 25
386
  },
387
  {
388
- "epoch": 1.7333333333333334,
389
- "grad_norm": 0.6544055342674255,
390
  "learning_rate": 0.0004415111107797445,
391
- "loss": 0.9646,
392
  "step": 26
393
  },
394
  {
395
- "epoch": 1.7333333333333334,
396
- "eval_loss": 0.9056078195571899,
397
- "eval_runtime": 3.3284,
398
- "eval_samples_per_second": 9.013,
399
- "eval_steps_per_second": 1.202,
400
  "step": 26
401
  },
402
  {
403
- "epoch": 1.8,
404
- "grad_norm": 0.6583496928215027,
405
  "learning_rate": 0.0004298349500846628,
406
- "loss": 1.0333,
407
  "step": 27
408
  },
409
  {
410
- "epoch": 1.8,
411
- "eval_loss": 0.8940725922584534,
412
- "eval_runtime": 3.3318,
413
- "eval_samples_per_second": 9.004,
414
- "eval_steps_per_second": 1.201,
415
  "step": 27
416
  },
417
  {
418
- "epoch": 1.8666666666666667,
419
- "grad_norm": 0.623849093914032,
420
  "learning_rate": 0.0004172826515897146,
421
- "loss": 1.0129,
422
  "step": 28
423
  },
424
  {
425
- "epoch": 1.8666666666666667,
426
- "eval_loss": 0.8719626665115356,
427
- "eval_runtime": 3.3349,
428
- "eval_samples_per_second": 8.996,
429
- "eval_steps_per_second": 1.199,
430
  "step": 28
431
  },
432
  {
433
- "epoch": 1.9333333333333333,
434
- "grad_norm": 0.6031587719917297,
435
  "learning_rate": 0.00040391536883141455,
436
- "loss": 0.9091,
437
  "step": 29
438
  },
439
  {
440
- "epoch": 1.9333333333333333,
441
- "eval_loss": 0.8551884889602661,
442
- "eval_runtime": 3.3315,
443
- "eval_samples_per_second": 9.005,
444
- "eval_steps_per_second": 1.201,
445
  "step": 29
446
  },
447
  {
448
- "epoch": 2.0,
449
- "grad_norm": 0.55727219581604,
450
  "learning_rate": 0.0003897982258676867,
451
- "loss": 0.9028,
452
  "step": 30
453
  },
454
  {
455
- "epoch": 2.0,
456
- "eval_loss": 0.8516466617584229,
457
- "eval_runtime": 3.329,
458
- "eval_samples_per_second": 9.012,
459
- "eval_steps_per_second": 1.202,
460
  "step": 30
461
  },
462
  {
463
- "epoch": 2.066666666666667,
464
- "grad_norm": 0.7247292399406433,
465
  "learning_rate": 0.000375,
466
- "loss": 0.8681,
467
  "step": 31
468
  },
469
  {
470
- "epoch": 2.066666666666667,
471
- "eval_loss": 0.8430901765823364,
472
- "eval_runtime": 3.3279,
473
- "eval_samples_per_second": 9.015,
474
- "eval_steps_per_second": 1.202,
475
  "step": 31
476
  },
477
  {
478
- "epoch": 2.1333333333333333,
479
- "grad_norm": 0.5927403569221497,
480
  "learning_rate": 0.00035959278669726934,
481
- "loss": 0.8846,
482
  "step": 32
483
  },
484
  {
485
- "epoch": 2.1333333333333333,
486
- "eval_loss": 0.8356520533561707,
487
- "eval_runtime": 3.3256,
488
- "eval_samples_per_second": 9.021,
489
- "eval_steps_per_second": 1.203,
490
  "step": 32
491
  },
492
  {
493
- "epoch": 2.2,
494
- "grad_norm": 0.4770275950431824,
495
  "learning_rate": 0.00034365164835397803,
496
- "loss": 0.8181,
497
  "step": 33
498
  },
499
  {
500
- "epoch": 2.2,
501
- "eval_loss": 0.8293011784553528,
502
- "eval_runtime": 3.3314,
503
- "eval_samples_per_second": 9.005,
504
- "eval_steps_per_second": 1.201,
505
  "step": 33
506
  },
507
  {
508
- "epoch": 2.2666666666666666,
509
- "grad_norm": 0.5398544073104858,
510
  "learning_rate": 0.00032725424859373687,
511
- "loss": 0.8316,
512
  "step": 34
513
  },
514
  {
515
- "epoch": 2.2666666666666666,
516
- "eval_loss": 0.8157733678817749,
517
- "eval_runtime": 3.3286,
518
- "eval_samples_per_second": 9.013,
519
- "eval_steps_per_second": 1.202,
520
  "step": 34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  }
522
  ],
523
  "logging_steps": 1,
524
  "max_steps": 60,
525
  "num_input_tokens_seen": 0,
526
  "num_train_epochs": 4,
527
- "save_steps": 1,
528
  "stateful_callbacks": {
529
  "TrainerControl": {
530
  "args": {
@@ -537,7 +777,7 @@
537
  "attributes": {}
538
  }
539
  },
540
- "total_flos": 1386766593552384.0,
541
  "train_batch_size": 1,
542
  "trial_name": null,
543
  "trial_params": null
 
1
  {
2
+ "best_global_step": 50,
3
+ "best_metric": 0.6460065841674805,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-50",
5
+ "epoch": 2.6315789473684212,
6
  "eval_steps": 1,
7
+ "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.05263157894736842,
14
+ "grad_norm": 2.3607187271118164,
15
  "learning_rate": 0.0,
16
+ "loss": 3.2235,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.05263157894736842,
21
  "eval_loss": 3.15524959564209,
22
+ "eval_runtime": 3.3685,
23
+ "eval_samples_per_second": 8.906,
24
+ "eval_steps_per_second": 1.187,
25
  "step": 1
26
  },
27
  {
28
+ "epoch": 0.10526315789473684,
29
+ "grad_norm": 2.431220531463623,
30
  "learning_rate": 3.3333333333333335e-05,
31
+ "loss": 3.165,
32
  "step": 2
33
  },
34
  {
35
+ "epoch": 0.10526315789473684,
36
+ "eval_loss": 3.0020461082458496,
37
+ "eval_runtime": 3.299,
38
+ "eval_samples_per_second": 9.094,
39
+ "eval_steps_per_second": 1.212,
40
  "step": 2
41
  },
42
  {
43
+ "epoch": 0.15789473684210525,
44
+ "grad_norm": 1.8372516632080078,
45
  "learning_rate": 6.666666666666667e-05,
46
+ "loss": 2.7821,
47
  "step": 3
48
  },
49
  {
50
+ "epoch": 0.15789473684210525,
51
+ "eval_loss": 2.6930112838745117,
52
+ "eval_runtime": 3.3119,
53
+ "eval_samples_per_second": 9.058,
54
+ "eval_steps_per_second": 1.208,
55
  "step": 3
56
  },
57
  {
58
+ "epoch": 0.21052631578947367,
59
+ "grad_norm": 1.6948609352111816,
60
  "learning_rate": 0.0001,
61
+ "loss": 2.7014,
62
  "step": 4
63
  },
64
  {
65
+ "epoch": 0.21052631578947367,
66
+ "eval_loss": 2.349722146987915,
67
+ "eval_runtime": 3.3194,
68
+ "eval_samples_per_second": 9.038,
69
+ "eval_steps_per_second": 1.205,
70
  "step": 4
71
  },
72
  {
73
+ "epoch": 0.2631578947368421,
74
+ "grad_norm": 1.333439826965332,
75
  "learning_rate": 0.00013333333333333334,
76
+ "loss": 2.3248,
77
  "step": 5
78
  },
79
  {
80
+ "epoch": 0.2631578947368421,
81
+ "eval_loss": 2.06449294090271,
82
+ "eval_runtime": 3.331,
83
+ "eval_samples_per_second": 9.006,
84
+ "eval_steps_per_second": 1.201,
85
  "step": 5
86
  },
87
  {
88
+ "epoch": 0.3157894736842105,
89
+ "grad_norm": 1.3499835729599,
90
  "learning_rate": 0.00016666666666666666,
91
+ "loss": 2.0069,
92
  "step": 6
93
  },
94
  {
95
+ "epoch": 0.3157894736842105,
96
+ "eval_loss": 1.8060506582260132,
97
+ "eval_runtime": 3.3417,
98
+ "eval_samples_per_second": 8.977,
99
+ "eval_steps_per_second": 1.197,
100
  "step": 6
101
  },
102
  {
103
+ "epoch": 0.3684210526315789,
104
+ "grad_norm": 1.649509310722351,
105
  "learning_rate": 0.0002,
106
+ "loss": 1.8274,
107
  "step": 7
108
  },
109
  {
110
+ "epoch": 0.3684210526315789,
111
+ "eval_loss": 1.554451584815979,
112
+ "eval_runtime": 3.3577,
113
+ "eval_samples_per_second": 8.935,
114
+ "eval_steps_per_second": 1.191,
115
  "step": 7
116
  },
117
  {
118
+ "epoch": 0.42105263157894735,
119
+ "grad_norm": 1.6009737253189087,
120
  "learning_rate": 0.00023333333333333333,
121
+ "loss": 1.5562,
122
  "step": 8
123
  },
124
  {
125
+ "epoch": 0.42105263157894735,
126
+ "eval_loss": 1.3974536657333374,
127
+ "eval_runtime": 3.3654,
128
+ "eval_samples_per_second": 8.914,
129
+ "eval_steps_per_second": 1.189,
130
  "step": 8
131
  },
132
  {
133
+ "epoch": 0.47368421052631576,
134
+ "grad_norm": 1.7731741666793823,
135
  "learning_rate": 0.0002666666666666667,
136
+ "loss": 1.4525,
137
  "step": 9
138
  },
139
  {
140
+ "epoch": 0.47368421052631576,
141
+ "eval_loss": 1.3451876640319824,
142
+ "eval_runtime": 3.3744,
143
+ "eval_samples_per_second": 8.89,
144
+ "eval_steps_per_second": 1.185,
145
  "step": 9
146
  },
147
  {
148
+ "epoch": 0.5263157894736842,
149
+ "grad_norm": 1.629805088043213,
150
  "learning_rate": 0.0003,
151
+ "loss": 1.4081,
152
  "step": 10
153
  },
154
  {
155
+ "epoch": 0.5263157894736842,
156
+ "eval_loss": 1.2556439638137817,
157
+ "eval_runtime": 3.3912,
158
+ "eval_samples_per_second": 8.847,
159
+ "eval_steps_per_second": 1.18,
160
  "step": 10
161
  },
162
  {
163
+ "epoch": 0.5789473684210527,
164
+ "grad_norm": 1.313006043434143,
165
  "learning_rate": 0.0003333333333333333,
166
+ "loss": 1.3422,
167
  "step": 11
168
  },
169
  {
170
+ "epoch": 0.5789473684210527,
171
+ "eval_loss": 1.1746076345443726,
172
+ "eval_runtime": 3.3899,
173
+ "eval_samples_per_second": 8.85,
174
+ "eval_steps_per_second": 1.18,
175
  "step": 11
176
  },
177
  {
178
+ "epoch": 0.631578947368421,
179
+ "grad_norm": 0.9396845698356628,
180
  "learning_rate": 0.00036666666666666667,
181
+ "loss": 1.2091,
182
  "step": 12
183
  },
184
  {
185
+ "epoch": 0.631578947368421,
186
+ "eval_loss": 1.1337084770202637,
187
+ "eval_runtime": 3.3948,
188
+ "eval_samples_per_second": 8.837,
189
+ "eval_steps_per_second": 1.178,
190
  "step": 12
191
  },
192
  {
193
+ "epoch": 0.6842105263157895,
194
+ "grad_norm": 1.076097846031189,
195
  "learning_rate": 0.0004,
196
+ "loss": 1.1891,
197
  "step": 13
198
  },
199
  {
200
+ "epoch": 0.6842105263157895,
201
+ "eval_loss": 1.0741407871246338,
202
+ "eval_runtime": 3.3911,
203
+ "eval_samples_per_second": 8.847,
204
+ "eval_steps_per_second": 1.18,
205
  "step": 13
206
  },
207
  {
208
+ "epoch": 0.7368421052631579,
209
+ "grad_norm": 0.8671520352363586,
210
  "learning_rate": 0.00043333333333333337,
211
+ "loss": 1.0924,
212
  "step": 14
213
  },
214
  {
215
+ "epoch": 0.7368421052631579,
216
+ "eval_loss": 1.050424575805664,
217
+ "eval_runtime": 3.3794,
218
+ "eval_samples_per_second": 8.877,
219
+ "eval_steps_per_second": 1.184,
220
  "step": 14
221
  },
222
  {
223
+ "epoch": 0.7894736842105263,
224
+ "grad_norm": 0.8102416396141052,
225
  "learning_rate": 0.00046666666666666666,
226
+ "loss": 1.1182,
227
  "step": 15
228
  },
229
  {
230
+ "epoch": 0.7894736842105263,
231
+ "eval_loss": 1.02986741065979,
232
+ "eval_runtime": 3.3781,
233
+ "eval_samples_per_second": 8.881,
234
+ "eval_steps_per_second": 1.184,
235
  "step": 15
236
  },
237
  {
238
+ "epoch": 0.8421052631578947,
239
+ "grad_norm": 1.4678000211715698,
240
  "learning_rate": 0.0005,
241
+ "loss": 1.1182,
242
  "step": 16
243
  },
244
  {
245
+ "epoch": 0.8421052631578947,
246
+ "eval_loss": 1.0076123476028442,
247
+ "eval_runtime": 3.3672,
248
+ "eval_samples_per_second": 8.91,
249
+ "eval_steps_per_second": 1.188,
250
  "step": 16
251
  },
252
  {
253
+ "epoch": 0.8947368421052632,
254
+ "grad_norm": 0.8557516932487488,
255
  "learning_rate": 0.0004993910125649561,
256
+ "loss": 1.1433,
257
  "step": 17
258
  },
259
  {
260
+ "epoch": 0.8947368421052632,
261
+ "eval_loss": 0.9948338866233826,
262
+ "eval_runtime": 3.3641,
263
+ "eval_samples_per_second": 8.918,
264
+ "eval_steps_per_second": 1.189,
265
  "step": 17
266
  },
267
  {
268
+ "epoch": 0.9473684210526315,
269
+ "grad_norm": 1.181545376777649,
270
  "learning_rate": 0.0004975670171853926,
271
+ "loss": 1.0207,
272
  "step": 18
273
  },
274
  {
275
+ "epoch": 0.9473684210526315,
276
+ "eval_loss": 0.959977924823761,
277
+ "eval_runtime": 3.3618,
278
+ "eval_samples_per_second": 8.924,
279
+ "eval_steps_per_second": 1.19,
280
  "step": 18
281
  },
282
  {
283
+ "epoch": 1.0,
284
+ "grad_norm": 0.7064942121505737,
285
  "learning_rate": 0.0004945369001834514,
286
+ "loss": 1.0768,
287
  "step": 19
288
  },
289
  {
290
+ "epoch": 1.0,
291
+ "eval_loss": 0.9442862272262573,
292
+ "eval_runtime": 3.3598,
293
+ "eval_samples_per_second": 8.929,
294
+ "eval_steps_per_second": 1.191,
295
  "step": 19
296
  },
297
  {
298
+ "epoch": 1.0526315789473684,
299
+ "grad_norm": 0.7763754725456238,
300
  "learning_rate": 0.0004903154239845797,
301
+ "loss": 0.9409,
302
  "step": 20
303
  },
304
  {
305
+ "epoch": 1.0526315789473684,
306
+ "eval_loss": 0.9225653409957886,
307
+ "eval_runtime": 3.3593,
308
+ "eval_samples_per_second": 8.93,
309
+ "eval_steps_per_second": 1.191,
310
  "step": 20
311
  },
312
  {
313
+ "epoch": 1.1052631578947367,
314
+ "grad_norm": 0.6782916188240051,
315
  "learning_rate": 0.0004849231551964771,
316
+ "loss": 0.9597,
317
  "step": 21
318
  },
319
  {
320
+ "epoch": 1.1052631578947367,
321
+ "eval_loss": 0.9122769832611084,
322
+ "eval_runtime": 3.3624,
323
+ "eval_samples_per_second": 8.922,
324
+ "eval_steps_per_second": 1.19,
325
  "step": 21
326
  },
327
  {
328
+ "epoch": 1.1578947368421053,
329
+ "grad_norm": 0.638238251209259,
330
  "learning_rate": 0.0004783863644106502,
331
+ "loss": 0.9609,
332
  "step": 22
333
  },
334
  {
335
+ "epoch": 1.1578947368421053,
336
+ "eval_loss": 0.8951469659805298,
337
+ "eval_runtime": 3.3744,
338
+ "eval_samples_per_second": 8.891,
339
+ "eval_steps_per_second": 1.185,
340
  "step": 22
341
  },
342
  {
343
+ "epoch": 1.2105263157894737,
344
+ "grad_norm": 0.6865942478179932,
345
  "learning_rate": 0.00047073689821473173,
346
+ "loss": 0.894,
347
  "step": 23
348
  },
349
  {
350
+ "epoch": 1.2105263157894737,
351
+ "eval_loss": 0.8961806893348694,
352
+ "eval_runtime": 3.3869,
353
+ "eval_samples_per_second": 8.858,
354
+ "eval_steps_per_second": 1.181,
355
  "step": 23
356
  },
357
  {
358
+ "epoch": 1.263157894736842,
359
+ "grad_norm": 0.7614845633506775,
360
  "learning_rate": 0.00046201202403910646,
361
+ "loss": 0.9654,
362
  "step": 24
363
  },
364
  {
365
+ "epoch": 1.263157894736842,
366
+ "eval_loss": 0.9240673184394836,
367
+ "eval_runtime": 3.3864,
368
+ "eval_samples_per_second": 8.859,
369
+ "eval_steps_per_second": 1.181,
370
  "step": 24
371
  },
372
  {
373
+ "epoch": 1.3157894736842106,
374
+ "grad_norm": 0.8841014504432678,
375
  "learning_rate": 0.0004522542485937369,
376
+ "loss": 0.8996,
377
  "step": 25
378
  },
379
  {
380
+ "epoch": 1.3157894736842106,
381
+ "eval_loss": 0.8987072706222534,
382
+ "eval_runtime": 3.3804,
383
+ "eval_samples_per_second": 8.875,
384
+ "eval_steps_per_second": 1.183,
385
  "step": 25
386
  },
387
  {
388
+ "epoch": 1.368421052631579,
389
+ "grad_norm": 0.695126473903656,
390
  "learning_rate": 0.0004415111107797445,
391
+ "loss": 0.9224,
392
  "step": 26
393
  },
394
  {
395
+ "epoch": 1.368421052631579,
396
+ "eval_loss": 0.8950093388557434,
397
+ "eval_runtime": 3.3744,
398
+ "eval_samples_per_second": 8.89,
399
+ "eval_steps_per_second": 1.185,
400
  "step": 26
401
  },
402
  {
403
+ "epoch": 1.4210526315789473,
404
+ "grad_norm": 0.6917558908462524,
405
  "learning_rate": 0.0004298349500846628,
406
+ "loss": 0.8954,
407
  "step": 27
408
  },
409
  {
410
+ "epoch": 1.4210526315789473,
411
+ "eval_loss": 0.8965355157852173,
412
+ "eval_runtime": 3.3739,
413
+ "eval_samples_per_second": 8.892,
414
+ "eval_steps_per_second": 1.186,
415
  "step": 27
416
  },
417
  {
418
+ "epoch": 1.4736842105263157,
419
+ "grad_norm": 0.6432511806488037,
420
  "learning_rate": 0.0004172826515897146,
421
+ "loss": 0.7978,
422
  "step": 28
423
  },
424
  {
425
+ "epoch": 1.4736842105263157,
426
+ "eval_loss": 0.8845272660255432,
427
+ "eval_runtime": 3.3701,
428
+ "eval_samples_per_second": 8.902,
429
+ "eval_steps_per_second": 1.187,
430
  "step": 28
431
  },
432
  {
433
+ "epoch": 1.526315789473684,
434
+ "grad_norm": 0.6906137466430664,
435
  "learning_rate": 0.00040391536883141455,
436
+ "loss": 0.9925,
437
  "step": 29
438
  },
439
  {
440
+ "epoch": 1.526315789473684,
441
+ "eval_loss": 0.8681280016899109,
442
+ "eval_runtime": 3.368,
443
+ "eval_samples_per_second": 8.907,
444
+ "eval_steps_per_second": 1.188,
445
  "step": 29
446
  },
447
  {
448
+ "epoch": 1.5789473684210527,
449
+ "grad_norm": 0.6398982405662537,
450
  "learning_rate": 0.0003897982258676867,
451
+ "loss": 0.8644,
452
  "step": 30
453
  },
454
  {
455
+ "epoch": 1.5789473684210527,
456
+ "eval_loss": 0.857525110244751,
457
+ "eval_runtime": 3.3617,
458
+ "eval_samples_per_second": 8.924,
459
+ "eval_steps_per_second": 1.19,
460
  "step": 30
461
  },
462
  {
463
+ "epoch": 1.631578947368421,
464
+ "grad_norm": 0.6282161474227905,
465
  "learning_rate": 0.000375,
466
+ "loss": 0.9207,
467
  "step": 31
468
  },
469
  {
470
+ "epoch": 1.631578947368421,
471
+ "eval_loss": 0.8413797616958618,
472
+ "eval_runtime": 3.3632,
473
+ "eval_samples_per_second": 8.92,
474
+ "eval_steps_per_second": 1.189,
475
  "step": 31
476
  },
477
  {
478
+ "epoch": 1.6842105263157894,
479
+ "grad_norm": 0.5699971914291382,
480
  "learning_rate": 0.00035959278669726934,
481
+ "loss": 0.8974,
482
  "step": 32
483
  },
484
  {
485
+ "epoch": 1.6842105263157894,
486
+ "eval_loss": 0.8179092407226562,
487
+ "eval_runtime": 3.3714,
488
+ "eval_samples_per_second": 8.898,
489
+ "eval_steps_per_second": 1.186,
490
  "step": 32
491
  },
492
  {
493
+ "epoch": 1.736842105263158,
494
+ "grad_norm": 0.7283058762550354,
495
  "learning_rate": 0.00034365164835397803,
496
+ "loss": 1.0363,
497
  "step": 33
498
  },
499
  {
500
+ "epoch": 1.736842105263158,
501
+ "eval_loss": 0.8006649017333984,
502
+ "eval_runtime": 3.3726,
503
+ "eval_samples_per_second": 8.895,
504
+ "eval_steps_per_second": 1.186,
505
  "step": 33
506
  },
507
  {
508
+ "epoch": 1.7894736842105263,
509
+ "grad_norm": 0.8358228206634521,
510
  "learning_rate": 0.00032725424859373687,
511
+ "loss": 0.8818,
512
  "step": 34
513
  },
514
  {
515
+ "epoch": 1.7894736842105263,
516
+ "eval_loss": 0.796642005443573,
517
+ "eval_runtime": 3.3722,
518
+ "eval_samples_per_second": 8.896,
519
+ "eval_steps_per_second": 1.186,
520
  "step": 34
521
+ },
522
+ {
523
+ "epoch": 1.8421052631578947,
524
+ "grad_norm": 0.6364978551864624,
525
+ "learning_rate": 0.0003104804738999169,
526
+ "loss": 0.9305,
527
+ "step": 35
528
+ },
529
+ {
530
+ "epoch": 1.8421052631578947,
531
+ "eval_loss": 0.7924755215644836,
532
+ "eval_runtime": 3.3733,
533
+ "eval_samples_per_second": 8.893,
534
+ "eval_steps_per_second": 1.186,
535
+ "step": 35
536
+ },
537
+ {
538
+ "epoch": 1.8947368421052633,
539
+ "grad_norm": 0.8200335502624512,
540
+ "learning_rate": 0.00029341204441673266,
541
+ "loss": 0.8827,
542
+ "step": 36
543
+ },
544
+ {
545
+ "epoch": 1.8947368421052633,
546
+ "eval_loss": 0.7788340449333191,
547
+ "eval_runtime": 3.3722,
548
+ "eval_samples_per_second": 8.896,
549
+ "eval_steps_per_second": 1.186,
550
+ "step": 36
551
+ },
552
+ {
553
+ "epoch": 1.9473684210526314,
554
+ "grad_norm": 0.775111198425293,
555
+ "learning_rate": 0.0002761321158169134,
556
+ "loss": 0.9169,
557
+ "step": 37
558
+ },
559
+ {
560
+ "epoch": 1.9473684210526314,
561
+ "eval_loss": 0.7667044401168823,
562
+ "eval_runtime": 3.3756,
563
+ "eval_samples_per_second": 8.887,
564
+ "eval_steps_per_second": 1.185,
565
+ "step": 37
566
+ },
567
+ {
568
+ "epoch": 2.0,
569
+ "grad_norm": 0.727277934551239,
570
+ "learning_rate": 0.0002587248741756253,
571
+ "loss": 1.0112,
572
+ "step": 38
573
+ },
574
+ {
575
+ "epoch": 2.0,
576
+ "eval_loss": 0.7591570615768433,
577
+ "eval_runtime": 3.3728,
578
+ "eval_samples_per_second": 8.895,
579
+ "eval_steps_per_second": 1.186,
580
+ "step": 38
581
+ },
582
+ {
583
+ "epoch": 2.0526315789473686,
584
+ "grad_norm": 0.5648457407951355,
585
+ "learning_rate": 0.00024127512582437484,
586
+ "loss": 0.8317,
587
+ "step": 39
588
+ },
589
+ {
590
+ "epoch": 2.0526315789473686,
591
+ "eval_loss": 0.7463916540145874,
592
+ "eval_runtime": 3.3708,
593
+ "eval_samples_per_second": 8.9,
594
+ "eval_steps_per_second": 1.187,
595
+ "step": 39
596
+ },
597
+ {
598
+ "epoch": 2.1052631578947367,
599
+ "grad_norm": 0.5476389527320862,
600
+ "learning_rate": 0.00022386788418308668,
601
+ "loss": 0.7733,
602
+ "step": 40
603
+ },
604
+ {
605
+ "epoch": 2.1052631578947367,
606
+ "eval_loss": 0.7394412159919739,
607
+ "eval_runtime": 3.3669,
608
+ "eval_samples_per_second": 8.91,
609
+ "eval_steps_per_second": 1.188,
610
+ "step": 40
611
+ },
612
+ {
613
+ "epoch": 2.1578947368421053,
614
+ "grad_norm": 0.4683343768119812,
615
+ "learning_rate": 0.00020658795558326743,
616
+ "loss": 0.7401,
617
+ "step": 41
618
+ },
619
+ {
620
+ "epoch": 2.1578947368421053,
621
+ "eval_loss": 0.7358477711677551,
622
+ "eval_runtime": 3.3619,
623
+ "eval_samples_per_second": 8.924,
624
+ "eval_steps_per_second": 1.19,
625
+ "step": 41
626
+ },
627
+ {
628
+ "epoch": 2.2105263157894735,
629
+ "grad_norm": 0.6029678583145142,
630
+ "learning_rate": 0.0001895195261000831,
631
+ "loss": 0.6829,
632
+ "step": 42
633
+ },
634
+ {
635
+ "epoch": 2.2105263157894735,
636
+ "eval_loss": 0.7268175482749939,
637
+ "eval_runtime": 3.3732,
638
+ "eval_samples_per_second": 8.894,
639
+ "eval_steps_per_second": 1.186,
640
+ "step": 42
641
+ },
642
+ {
643
+ "epoch": 2.263157894736842,
644
+ "grad_norm": 0.6847506761550903,
645
+ "learning_rate": 0.00017274575140626317,
646
+ "loss": 0.7923,
647
+ "step": 43
648
+ },
649
+ {
650
+ "epoch": 2.263157894736842,
651
+ "eval_loss": 0.7140093445777893,
652
+ "eval_runtime": 3.3766,
653
+ "eval_samples_per_second": 8.885,
654
+ "eval_steps_per_second": 1.185,
655
+ "step": 43
656
+ },
657
+ {
658
+ "epoch": 2.3157894736842106,
659
+ "grad_norm": 0.6127113699913025,
660
+ "learning_rate": 0.00015634835164602198,
661
+ "loss": 0.7396,
662
+ "step": 44
663
+ },
664
+ {
665
+ "epoch": 2.3157894736842106,
666
+ "eval_loss": 0.6983242034912109,
667
+ "eval_runtime": 3.3684,
668
+ "eval_samples_per_second": 8.906,
669
+ "eval_steps_per_second": 1.188,
670
+ "step": 44
671
+ },
672
+ {
673
+ "epoch": 2.3684210526315788,
674
+ "grad_norm": 0.538176953792572,
675
+ "learning_rate": 0.00014040721330273062,
676
+ "loss": 0.6553,
677
+ "step": 45
678
+ },
679
+ {
680
+ "epoch": 2.3684210526315788,
681
+ "eval_loss": 0.6850975155830383,
682
+ "eval_runtime": 3.3723,
683
+ "eval_samples_per_second": 8.896,
684
+ "eval_steps_per_second": 1.186,
685
+ "step": 45
686
+ },
687
+ {
688
+ "epoch": 2.4210526315789473,
689
+ "grad_norm": 0.6419486999511719,
690
+ "learning_rate": 0.00012500000000000006,
691
+ "loss": 0.7364,
692
+ "step": 46
693
+ },
694
+ {
695
+ "epoch": 2.4210526315789473,
696
+ "eval_loss": 0.6766163110733032,
697
+ "eval_runtime": 3.3706,
698
+ "eval_samples_per_second": 8.9,
699
+ "eval_steps_per_second": 1.187,
700
+ "step": 46
701
+ },
702
+ {
703
+ "epoch": 2.473684210526316,
704
+ "grad_norm": 0.5997453331947327,
705
+ "learning_rate": 0.00011020177413231333,
706
+ "loss": 0.6901,
707
+ "step": 47
708
+ },
709
+ {
710
+ "epoch": 2.473684210526316,
711
+ "eval_loss": 0.667664110660553,
712
+ "eval_runtime": 3.3701,
713
+ "eval_samples_per_second": 8.902,
714
+ "eval_steps_per_second": 1.187,
715
+ "step": 47
716
+ },
717
+ {
718
+ "epoch": 2.526315789473684,
719
+ "grad_norm": 0.5617692470550537,
720
+ "learning_rate": 9.608463116858542e-05,
721
+ "loss": 0.6299,
722
+ "step": 48
723
+ },
724
+ {
725
+ "epoch": 2.526315789473684,
726
+ "eval_loss": 0.658656895160675,
727
+ "eval_runtime": 3.3698,
728
+ "eval_samples_per_second": 8.903,
729
+ "eval_steps_per_second": 1.187,
730
+ "step": 48
731
+ },
732
+ {
733
+ "epoch": 2.5789473684210527,
734
+ "grad_norm": 0.5850865840911865,
735
+ "learning_rate": 8.271734841028553e-05,
736
+ "loss": 0.717,
737
+ "step": 49
738
+ },
739
+ {
740
+ "epoch": 2.5789473684210527,
741
+ "eval_loss": 0.6522302627563477,
742
+ "eval_runtime": 3.3689,
743
+ "eval_samples_per_second": 8.905,
744
+ "eval_steps_per_second": 1.187,
745
+ "step": 49
746
+ },
747
+ {
748
+ "epoch": 2.6315789473684212,
749
+ "grad_norm": 0.5645343661308289,
750
+ "learning_rate": 7.016504991533726e-05,
751
+ "loss": 0.6396,
752
+ "step": 50
753
+ },
754
+ {
755
+ "epoch": 2.6315789473684212,
756
+ "eval_loss": 0.6460065841674805,
757
+ "eval_runtime": 3.3689,
758
+ "eval_samples_per_second": 8.905,
759
+ "eval_steps_per_second": 1.187,
760
+ "step": 50
761
  }
762
  ],
763
  "logging_steps": 1,
764
  "max_steps": 60,
765
  "num_input_tokens_seen": 0,
766
  "num_train_epochs": 4,
767
+ "save_steps": 10,
768
  "stateful_callbacks": {
769
  "TrainerControl": {
770
  "args": {
 
777
  "attributes": {}
778
  }
779
  },
780
+ "total_flos": 2071824550963200.0,
781
  "train_batch_size": 1,
782
  "trial_name": null,
783
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35a31c5738d1e04a631eaba235b32a635a2d813fe2fdb0a67056063042a474b2
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edcc177b311a910c114a6f967889ae7a76ba2972b6975c424e0b408727d54675
3
  size 6033