yknxh commited on
Commit
4080030
·
verified ·
1 Parent(s): 8a3fba7

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yknxxh-seoul-national-university/huggingface/runs/zn4j6jmk)
31
 
32
  This model was trained with SFT.
33
 
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yknxxh-seoul-national-university/huggingface/runs/qk077co9)
31
 
32
  This model was trained with SFT.
33
 
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 2.7685804472008704e+16,
4
- "train_loss": 1.0384218392182714,
5
- "train_runtime": 242.5709,
6
  "train_samples": 10000,
7
- "train_samples_per_second": 38.179,
8
- "train_steps_per_second": 2.387
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 2.5856227500097536e+16,
4
+ "train_loss": 1.017678025019103,
5
+ "train_runtime": 224.2528,
6
  "train_samples": 10000,
7
+ "train_samples_per_second": 38.568,
8
+ "train_steps_per_second": 2.421
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95a541f3397f3ea70e4797acd153ad3d579b4b029fa3dbbe7c2edb68c03c5457
3
  size 2471645608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e1d80a91a84e41f750f86c8e62de9533f001784a64e4505162111138a38af21
3
  size 2471645608
runs/Feb14_06-44-19_yekyung-nah-0/events.out.tfevents.1739483078.yekyung-nah-0.3735356.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35ab42525d078c3a6d2274c74a43ae0a90a88dfc1bc107d7fa43542431819df7
3
- size 16266
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0619ad61ddd9702259f41551a76d2a8d20c1d373543c7fe2cfc85b3cc05f2882
3
+ size 28858
runs/Feb14_06-47-41_yekyung-nah-0/events.out.tfevents.1739483278.yekyung-nah-0.3747757.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1356a3b87766e684961517e4490bb72c4798f8f3eb9a3c37bd1f1e92a3fc45f6
3
+ size 10577
runs/Feb14_06-47-47_yekyung-nah-0/events.out.tfevents.1739483287.yekyung-nah-0.3748246.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f9a74a3bb1e50421899df919e0ee24f13c88d5b1c4097baaf2aed112704f7ef
3
+ size 9956
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 2.7685804472008704e+16,
4
- "train_loss": 1.0384218392182714,
5
- "train_runtime": 242.5709,
6
  "train_samples": 10000,
7
- "train_samples_per_second": 38.179,
8
- "train_steps_per_second": 2.387
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 2.5856227500097536e+16,
4
+ "train_loss": 1.017678025019103,
5
+ "train_runtime": 224.2528,
6
  "train_samples": 10000,
7
+ "train_samples_per_second": 38.568,
8
+ "train_steps_per_second": 2.421
9
  }
trainer_state.json CHANGED
@@ -3,835 +3,786 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 579,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0051813471502590676,
13
- "grad_norm": 7.28125,
14
- "learning_rate": 3.4482758620689656e-07,
15
- "loss": 2.3287,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.025906735751295335,
20
- "grad_norm": 7.125,
21
- "learning_rate": 1.724137931034483e-06,
22
- "loss": 2.3778,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.05181347150259067,
27
- "grad_norm": 7.34375,
28
- "learning_rate": 3.448275862068966e-06,
29
- "loss": 2.3462,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.07772020725388601,
34
- "grad_norm": 7.625,
35
- "learning_rate": 5.172413793103449e-06,
36
- "loss": 2.303,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.10362694300518134,
41
- "grad_norm": 6.0625,
42
- "learning_rate": 6.896551724137932e-06,
43
- "loss": 2.2303,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.12953367875647667,
48
- "grad_norm": 6.03125,
49
- "learning_rate": 8.620689655172414e-06,
50
- "loss": 2.1691,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.15544041450777202,
55
- "grad_norm": 3.921875,
56
- "learning_rate": 1.0344827586206898e-05,
57
- "loss": 1.9911,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.18134715025906736,
62
- "grad_norm": 3.3125,
63
- "learning_rate": 1.206896551724138e-05,
64
- "loss": 1.8287,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.20725388601036268,
69
- "grad_norm": 2.5,
70
- "learning_rate": 1.3793103448275863e-05,
71
- "loss": 1.7025,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.23316062176165803,
76
- "grad_norm": 2.3125,
77
- "learning_rate": 1.5517241379310346e-05,
78
- "loss": 1.5651,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.25906735751295334,
83
- "grad_norm": 1.875,
84
- "learning_rate": 1.7241379310344828e-05,
85
- "loss": 1.4727,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.2849740932642487,
90
- "grad_norm": 2.03125,
91
- "learning_rate": 1.896551724137931e-05,
92
- "loss": 1.3068,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.31088082901554404,
97
- "grad_norm": 1.2578125,
98
- "learning_rate": 1.9999272808103276e-05,
99
- "loss": 1.1946,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.33678756476683935,
104
  "grad_norm": 1.0078125,
105
- "learning_rate": 1.9991093113822542e-05,
106
- "loss": 1.1095,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.3626943005181347,
111
- "grad_norm": 0.91015625,
112
- "learning_rate": 1.997383219496441e-05,
113
- "loss": 1.0579,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.38860103626943004,
118
- "grad_norm": 0.73046875,
119
- "learning_rate": 1.9947505740530947e-05,
120
- "loss": 1.0251,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.41450777202072536,
125
- "grad_norm": 0.7421875,
126
- "learning_rate": 1.991213767947991e-05,
127
- "loss": 1.0027,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.44041450777202074,
132
- "grad_norm": 0.71875,
133
- "learning_rate": 1.986776015897494e-05,
134
- "loss": 1.0183,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.46632124352331605,
139
- "grad_norm": 0.67578125,
140
- "learning_rate": 1.9814413515165974e-05,
141
- "loss": 1.0133,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.49222797927461137,
146
- "grad_norm": 0.640625,
147
- "learning_rate": 1.975214623652643e-05,
148
- "loss": 0.9781,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.5181347150259067,
153
- "grad_norm": 0.6640625,
154
- "learning_rate": 1.968101491978049e-05,
155
- "loss": 0.9952,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.5440414507772021,
160
- "grad_norm": 0.6875,
161
- "learning_rate": 1.9601084218460494e-05,
162
- "loss": 0.9748,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.5699481865284974,
167
- "grad_norm": 0.6171875,
168
- "learning_rate": 1.9512426784141307e-05,
169
- "loss": 0.9606,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.5958549222797928,
174
- "grad_norm": 0.62890625,
175
- "learning_rate": 1.9415123200404962e-05,
176
- "loss": 0.985,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.6217616580310881,
181
- "grad_norm": 0.671875,
182
- "learning_rate": 1.930926190959566e-05,
183
- "loss": 0.9631,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.6476683937823834,
188
- "grad_norm": 0.6328125,
189
- "learning_rate": 1.9194939132431682e-05,
190
- "loss": 0.9482,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.6735751295336787,
195
- "grad_norm": 0.609375,
196
- "learning_rate": 1.9072258780547317e-05,
197
- "loss": 0.9733,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.6994818652849741,
202
- "grad_norm": 0.63671875,
203
- "learning_rate": 1.894133236204423e-05,
204
- "loss": 0.9539,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.7253886010362695,
209
- "grad_norm": 0.70703125,
210
- "learning_rate": 1.880227888013818e-05,
211
- "loss": 0.9892,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.7512953367875648,
216
- "grad_norm": 0.6171875,
217
- "learning_rate": 1.8655224724993202e-05,
218
- "loss": 0.9594,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.7772020725388601,
223
- "grad_norm": 0.609375,
224
- "learning_rate": 1.850030355884151e-05,
225
- "loss": 0.9257,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.8031088082901554,
230
- "grad_norm": 0.62109375,
231
- "learning_rate": 1.8337656194493637e-05,
232
- "loss": 0.9688,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.8290155440414507,
237
- "grad_norm": 0.65625,
238
- "learning_rate": 1.8167430467349145e-05,
239
- "loss": 0.9853,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.8549222797927462,
244
- "grad_norm": 0.62109375,
245
- "learning_rate": 1.7989781101024305e-05,
246
- "loss": 0.9375,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.8808290155440415,
251
- "grad_norm": 0.70703125,
252
- "learning_rate": 1.7804869566718833e-05,
253
- "loss": 0.9712,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.9067357512953368,
258
- "grad_norm": 0.6953125,
259
- "learning_rate": 1.7612863936449568e-05,
260
- "loss": 0.951,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.9326424870466321,
265
- "grad_norm": 0.6875,
266
- "learning_rate": 1.7413938730284404e-05,
267
- "loss": 0.9801,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.9585492227979274,
272
- "grad_norm": 0.640625,
273
- "learning_rate": 1.7208274757715425e-05,
274
- "loss": 0.9527,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.9844559585492227,
279
- "grad_norm": 0.61328125,
280
- "learning_rate": 1.6996058953315372e-05,
281
- "loss": 0.9285,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 1.0103626943005182,
286
- "grad_norm": 0.6171875,
287
- "learning_rate": 1.6777484206826793e-05,
288
- "loss": 0.9499,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 1.0362694300518134,
293
- "grad_norm": 0.69921875,
294
- "learning_rate": 1.6552749187838425e-05,
295
- "loss": 0.9556,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 1.0621761658031088,
300
- "grad_norm": 0.640625,
301
- "learning_rate": 1.632205816520799e-05,
302
- "loss": 0.9324,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 1.0880829015544042,
307
- "grad_norm": 0.7265625,
308
- "learning_rate": 1.6085620821395722e-05,
309
- "loss": 0.9479,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 1.1139896373056994,
314
- "grad_norm": 0.57421875,
315
- "learning_rate": 1.5843652061877245e-05,
316
- "loss": 0.9215,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 1.1398963730569949,
321
- "grad_norm": 0.5859375,
322
- "learning_rate": 1.5596371819809104e-05,
323
- "loss": 0.9367,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 1.16580310880829,
328
- "grad_norm": 0.5703125,
329
- "learning_rate": 1.534400485612449e-05,
330
- "loss": 0.91,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 1.1917098445595855,
335
- "grad_norm": 0.63671875,
336
- "learning_rate": 1.5086780555240803e-05,
337
- "loss": 0.9297,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 1.2176165803108807,
342
- "grad_norm": 0.56640625,
343
- "learning_rate": 1.482493271656482e-05,
344
- "loss": 0.9426,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 1.2435233160621761,
349
- "grad_norm": 0.60546875,
350
- "learning_rate": 1.4558699341984928e-05,
351
- "loss": 0.9639,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 1.2694300518134716,
356
- "grad_norm": 0.59765625,
357
- "learning_rate": 1.4288322419543576e-05,
358
- "loss": 0.946,
359
  "step": 245
360
  },
361
  {
362
- "epoch": 1.2953367875647668,
363
- "grad_norm": 0.65234375,
364
- "learning_rate": 1.40140477034866e-05,
365
- "loss": 0.9438,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 1.3212435233160622,
370
- "grad_norm": 0.6015625,
371
- "learning_rate": 1.373612449088931e-05,
372
- "loss": 0.9652,
373
  "step": 255
374
  },
375
  {
376
- "epoch": 1.3471502590673574,
377
- "grad_norm": 0.59765625,
378
- "learning_rate": 1.3454805395062386e-05,
379
- "loss": 0.9245,
380
  "step": 260
381
  },
382
  {
383
- "epoch": 1.3730569948186528,
384
- "grad_norm": 0.66015625,
385
- "learning_rate": 1.3170346115943575e-05,
386
- "loss": 0.9195,
387
  "step": 265
388
  },
389
  {
390
- "epoch": 1.3989637305699483,
391
- "grad_norm": 0.6015625,
392
- "learning_rate": 1.2883005207683784e-05,
393
- "loss": 0.9341,
394
  "step": 270
395
  },
396
  {
397
- "epoch": 1.4248704663212435,
398
- "grad_norm": 0.6015625,
399
- "learning_rate": 1.2593043843638978e-05,
400
- "loss": 0.9238,
401
  "step": 275
402
  },
403
  {
404
- "epoch": 1.450777202072539,
405
- "grad_norm": 0.62109375,
406
- "learning_rate": 1.2300725578981308e-05,
407
- "loss": 0.9271,
408
  "step": 280
409
  },
410
  {
411
- "epoch": 1.4766839378238341,
412
- "grad_norm": 0.58984375,
413
- "learning_rate": 1.2006316111145401e-05,
414
- "loss": 0.9348,
415
  "step": 285
416
  },
417
  {
418
- "epoch": 1.5025906735751295,
419
- "grad_norm": 0.70703125,
420
- "learning_rate": 1.1710083038327436e-05,
421
- "loss": 0.9341,
422
  "step": 290
423
  },
424
  {
425
- "epoch": 1.528497409326425,
426
- "grad_norm": 0.70703125,
427
- "learning_rate": 1.1412295616256577e-05,
428
- "loss": 0.946,
429
  "step": 295
430
  },
431
  {
432
- "epoch": 1.5544041450777202,
433
- "grad_norm": 0.59375,
434
- "learning_rate": 1.1113224513459819e-05,
435
- "loss": 0.9134,
436
  "step": 300
437
  },
438
  {
439
- "epoch": 1.5803108808290154,
440
- "grad_norm": 0.61328125,
441
- "learning_rate": 1.0813141565242682e-05,
442
- "loss": 0.9177,
443
  "step": 305
444
  },
445
  {
446
- "epoch": 1.6062176165803108,
447
- "grad_norm": 0.65625,
448
- "learning_rate": 1.0512319526609405e-05,
449
- "loss": 0.937,
450
  "step": 310
451
  },
452
  {
453
- "epoch": 1.6321243523316062,
454
- "grad_norm": 0.6015625,
455
- "learning_rate": 1.021103182434718e-05,
456
- "loss": 0.9398,
457
  "step": 315
458
  },
459
  {
460
- "epoch": 1.6580310880829017,
461
- "grad_norm": 0.5625,
462
- "learning_rate": 9.909552308499792e-06,
463
- "loss": 0.9289,
464
  "step": 320
465
  },
466
  {
467
- "epoch": 1.6839378238341969,
468
- "grad_norm": 0.71484375,
469
- "learning_rate": 9.608155003456529e-06,
470
- "loss": 0.9075,
471
  "step": 325
472
  },
473
  {
474
- "epoch": 1.709844559585492,
475
- "grad_norm": 0.6328125,
476
- "learning_rate": 9.307113858882664e-06,
477
- "loss": 0.9154,
478
  "step": 330
479
  },
480
  {
481
- "epoch": 1.7357512953367875,
482
- "grad_norm": 0.55859375,
483
- "learning_rate": 9.006702500717786e-06,
484
- "loss": 0.9036,
485
  "step": 335
486
  },
487
  {
488
- "epoch": 1.761658031088083,
489
- "grad_norm": 0.69921875,
490
- "learning_rate": 8.707193982468456e-06,
491
- "loss": 0.9421,
492
  "step": 340
493
  },
494
  {
495
- "epoch": 1.7875647668393784,
496
- "grad_norm": 0.64453125,
497
- "learning_rate": 8.408860537021127e-06,
498
- "loss": 0.9359,
499
  "step": 345
500
  },
501
  {
502
- "epoch": 1.8134715025906736,
503
- "grad_norm": 0.58984375,
504
- "learning_rate": 8.111973329200909e-06,
505
- "loss": 0.9473,
506
  "step": 350
507
  },
508
  {
509
- "epoch": 1.8393782383419688,
510
- "grad_norm": 0.6328125,
511
- "learning_rate": 7.816802209301241e-06,
512
- "loss": 0.9257,
513
  "step": 355
514
  },
515
  {
516
- "epoch": 1.8652849740932642,
517
- "grad_norm": 0.609375,
518
- "learning_rate": 7.523615467808249e-06,
519
- "loss": 0.9486,
520
  "step": 360
521
  },
522
  {
523
- "epoch": 1.8911917098445596,
524
- "grad_norm": 0.640625,
525
- "learning_rate": 7.232679591542979e-06,
526
- "loss": 0.9366,
527
  "step": 365
528
  },
529
  {
530
- "epoch": 1.917098445595855,
531
- "grad_norm": 0.60546875,
532
- "learning_rate": 6.944259021442967e-06,
533
- "loss": 0.9292,
534
  "step": 370
535
  },
536
  {
537
- "epoch": 1.9430051813471503,
538
- "grad_norm": 0.62109375,
539
- "learning_rate": 6.6586159122033914e-06,
540
- "loss": 0.9025,
541
  "step": 375
542
  },
543
  {
544
- "epoch": 1.9689119170984455,
545
- "grad_norm": 0.609375,
546
- "learning_rate": 6.3760098939962935e-06,
547
- "loss": 0.9151,
548
  "step": 380
549
  },
550
  {
551
- "epoch": 1.994818652849741,
552
- "grad_norm": 0.73046875,
553
- "learning_rate": 6.096697836484382e-06,
554
- "loss": 0.9283,
555
  "step": 385
556
  },
557
  {
558
- "epoch": 2.0207253886010363,
559
- "grad_norm": 0.640625,
560
- "learning_rate": 5.820933615343975e-06,
561
- "loss": 0.9249,
562
  "step": 390
563
  },
564
  {
565
- "epoch": 2.0466321243523318,
566
- "grad_norm": 0.5390625,
567
- "learning_rate": 5.548967881509275e-06,
568
- "loss": 0.9119,
569
  "step": 395
570
  },
571
  {
572
- "epoch": 2.0725388601036268,
573
- "grad_norm": 0.60546875,
574
- "learning_rate": 5.281047833347676e-06,
575
- "loss": 0.9236,
576
  "step": 400
577
  },
578
  {
579
- "epoch": 2.098445595854922,
580
- "grad_norm": 0.609375,
581
- "learning_rate": 5.017416991973281e-06,
582
- "loss": 0.9169,
583
  "step": 405
584
  },
585
  {
586
- "epoch": 2.1243523316062176,
587
- "grad_norm": 0.62109375,
588
- "learning_rate": 4.758314979902734e-06,
589
- "loss": 0.9315,
590
  "step": 410
591
  },
592
  {
593
- "epoch": 2.150259067357513,
594
- "grad_norm": 0.5859375,
595
- "learning_rate": 4.503977303254673e-06,
596
- "loss": 0.9111,
597
  "step": 415
598
  },
599
  {
600
- "epoch": 2.1761658031088085,
601
- "grad_norm": 0.62109375,
602
- "learning_rate": 4.25463513769064e-06,
603
- "loss": 0.9495,
604
  "step": 420
605
  },
606
  {
607
- "epoch": 2.2020725388601035,
608
- "grad_norm": 0.59375,
609
- "learning_rate": 4.010515118292127e-06,
610
- "loss": 0.9338,
611
  "step": 425
612
  },
613
  {
614
- "epoch": 2.227979274611399,
615
- "grad_norm": 0.5625,
616
- "learning_rate": 3.771839133564704e-06,
617
- "loss": 0.9028,
618
  "step": 430
619
  },
620
  {
621
- "epoch": 2.2538860103626943,
622
- "grad_norm": 0.625,
623
- "learning_rate": 3.5388241237564337e-06,
624
- "loss": 0.9227,
625
  "step": 435
626
  },
627
  {
628
- "epoch": 2.2797927461139897,
629
- "grad_norm": 0.64453125,
630
- "learning_rate": 3.311681883673937e-06,
631
- "loss": 0.9462,
632
  "step": 440
633
  },
634
  {
635
- "epoch": 2.305699481865285,
636
- "grad_norm": 0.6015625,
637
- "learning_rate": 3.0906188701753127e-06,
638
- "loss": 0.929,
639
  "step": 445
640
  },
641
  {
642
- "epoch": 2.33160621761658,
643
- "grad_norm": 0.609375,
644
- "learning_rate": 2.875836014514867e-06,
645
- "loss": 0.9178,
646
  "step": 450
647
  },
648
  {
649
- "epoch": 2.3575129533678756,
650
- "grad_norm": 0.58203125,
651
- "learning_rate": 2.6675285397102856e-06,
652
- "loss": 0.9052,
653
  "step": 455
654
  },
655
  {
656
- "epoch": 2.383419689119171,
657
- "grad_norm": 0.59375,
658
- "learning_rate": 2.465885783098166e-06,
659
- "loss": 0.9019,
660
  "step": 460
661
  },
662
  {
663
- "epoch": 2.4093264248704664,
664
- "grad_norm": 0.6015625,
665
- "learning_rate": 2.2710910242392468e-06,
666
- "loss": 0.9393,
667
  "step": 465
668
  },
669
  {
670
- "epoch": 2.4352331606217614,
671
- "grad_norm": 0.6328125,
672
- "learning_rate": 2.0833213183297475e-06,
673
- "loss": 0.9261,
674
  "step": 470
675
  },
676
  {
677
- "epoch": 2.461139896373057,
678
- "grad_norm": 0.72265625,
679
- "learning_rate": 1.9027473352702208e-06,
680
- "loss": 0.9021,
681
  "step": 475
682
  },
683
  {
684
- "epoch": 2.4870466321243523,
685
- "grad_norm": 0.5625,
686
- "learning_rate": 1.729533204538224e-06,
687
- "loss": 0.9289,
688
  "step": 480
689
  },
690
  {
691
- "epoch": 2.5129533678756477,
692
- "grad_norm": 0.625,
693
- "learning_rate": 1.563836366005782e-06,
694
- "loss": 0.9077,
695
  "step": 485
696
  },
697
  {
698
- "epoch": 2.538860103626943,
699
- "grad_norm": 0.625,
700
- "learning_rate": 1.4058074268372224e-06,
701
- "loss": 0.9668,
702
  "step": 490
703
  },
704
  {
705
- "epoch": 2.5647668393782386,
706
- "grad_norm": 0.59765625,
707
- "learning_rate": 1.2555900245975262e-06,
708
- "loss": 0.9356,
709
  "step": 495
710
  },
711
  {
712
- "epoch": 2.5906735751295336,
713
- "grad_norm": 0.5390625,
714
- "learning_rate": 1.1133206966955213e-06,
715
- "loss": 0.9311,
716
  "step": 500
717
  },
718
  {
719
- "epoch": 2.616580310880829,
720
- "grad_norm": 0.5703125,
721
- "learning_rate": 9.79128756280675e-07,
722
- "loss": 0.9411,
723
  "step": 505
724
  },
725
  {
726
- "epoch": 2.6424870466321244,
727
- "grad_norm": 0.69140625,
728
- "learning_rate": 8.531361747062272e-07,
729
- "loss": 0.9778,
730
  "step": 510
731
  },
732
  {
733
- "epoch": 2.66839378238342,
734
- "grad_norm": 0.57421875,
735
- "learning_rate": 7.354574706655038e-07,
736
- "loss": 0.9099,
737
  "step": 515
738
  },
739
  {
740
- "epoch": 2.694300518134715,
741
- "grad_norm": 0.578125,
742
- "learning_rate": 6.261996061022335e-07,
743
- "loss": 0.9045,
744
  "step": 520
745
  },
746
  {
747
- "epoch": 2.7202072538860103,
748
- "grad_norm": 0.58203125,
749
- "learning_rate": 5.254618889893858e-07,
750
- "loss": 0.8916,
751
  "step": 525
752
  },
753
  {
754
- "epoch": 2.7461139896373057,
755
- "grad_norm": 0.61328125,
756
- "learning_rate": 4.3333588306499584e-07,
757
- "loss": 0.9151,
758
  "step": 530
759
  },
760
  {
761
- "epoch": 2.772020725388601,
762
- "grad_norm": 0.5703125,
763
- "learning_rate": 3.499053246069362e-07,
764
- "loss": 0.9418,
765
  "step": 535
766
  },
767
  {
768
- "epoch": 2.7979274611398965,
769
- "grad_norm": 0.59765625,
770
- "learning_rate": 2.7524604632233054e-07,
771
- "loss": 0.9384,
772
  "step": 540
773
  },
774
- {
775
- "epoch": 2.823834196891192,
776
- "grad_norm": 0.59375,
777
- "learning_rate": 2.0942590842078503e-07,
778
- "loss": 0.9117,
779
- "step": 545
780
- },
781
- {
782
- "epoch": 2.849740932642487,
783
- "grad_norm": 0.6328125,
784
- "learning_rate": 1.5250473693406486e-07,
785
- "loss": 0.9154,
786
- "step": 550
787
- },
788
- {
789
- "epoch": 2.8756476683937824,
790
- "grad_norm": 0.6328125,
791
- "learning_rate": 1.0453426933830002e-07,
792
- "loss": 0.942,
793
- "step": 555
794
- },
795
- {
796
- "epoch": 2.901554404145078,
797
- "grad_norm": 0.609375,
798
- "learning_rate": 6.555810752813308e-08,
799
- "loss": 0.9441,
800
- "step": 560
801
- },
802
- {
803
- "epoch": 2.927461139896373,
804
- "grad_norm": 0.59765625,
805
- "learning_rate": 3.5611678185563106e-08,
806
- "loss": 0.9353,
807
- "step": 565
808
- },
809
- {
810
- "epoch": 2.9533678756476682,
811
- "grad_norm": 0.5859375,
812
- "learning_rate": 1.4722200579497803e-08,
813
- "loss": 0.9235,
814
- "step": 570
815
- },
816
- {
817
- "epoch": 2.9792746113989637,
818
- "grad_norm": 0.59375,
819
- "learning_rate": 2.9086618252893717e-09,
820
- "loss": 0.9246,
821
- "step": 575
822
- },
823
  {
824
  "epoch": 3.0,
825
- "step": 579,
826
- "total_flos": 2.7685804472008704e+16,
827
- "train_loss": 1.0384218392182714,
828
- "train_runtime": 242.5709,
829
- "train_samples_per_second": 38.179,
830
- "train_steps_per_second": 2.387
831
  }
832
  ],
833
  "logging_steps": 5,
834
- "max_steps": 579,
835
  "num_input_tokens_seen": 0,
836
  "num_train_epochs": 3,
837
  "save_steps": 500,
@@ -847,7 +798,7 @@
847
  "attributes": {}
848
  }
849
  },
850
- "total_flos": 2.7685804472008704e+16,
851
  "train_batch_size": 16,
852
  "trial_name": null,
853
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 543,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0055248618784530384,
13
+ "grad_norm": 7.4375,
14
+ "learning_rate": 3.6363636363636366e-07,
15
+ "loss": 2.4042,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.027624309392265192,
20
+ "grad_norm": 7.59375,
21
+ "learning_rate": 1.8181818181818183e-06,
22
+ "loss": 2.4209,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.055248618784530384,
27
+ "grad_norm": 7.1875,
28
+ "learning_rate": 3.6363636363636366e-06,
29
+ "loss": 2.4119,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.08287292817679558,
34
+ "grad_norm": 7.03125,
35
+ "learning_rate": 5.4545454545454545e-06,
36
+ "loss": 2.3657,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.11049723756906077,
41
+ "grad_norm": 7.28125,
42
+ "learning_rate": 7.272727272727273e-06,
43
+ "loss": 2.3123,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.13812154696132597,
48
+ "grad_norm": 5.65625,
49
+ "learning_rate": 9.090909090909091e-06,
50
+ "loss": 2.1773,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.16574585635359115,
55
+ "grad_norm": 3.84375,
56
+ "learning_rate": 1.0909090909090909e-05,
57
+ "loss": 2.0066,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.19337016574585636,
62
+ "grad_norm": 3.0,
63
+ "learning_rate": 1.2727272727272728e-05,
64
+ "loss": 1.9028,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.22099447513812154,
69
+ "grad_norm": 2.421875,
70
+ "learning_rate": 1.4545454545454546e-05,
71
+ "loss": 1.7204,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.24861878453038674,
76
+ "grad_norm": 2.109375,
77
+ "learning_rate": 1.6363636363636366e-05,
78
+ "loss": 1.5881,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.27624309392265195,
83
+ "grad_norm": 1.8125,
84
+ "learning_rate": 1.8181818181818182e-05,
85
+ "loss": 1.4278,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.30386740331491713,
90
+ "grad_norm": 1.71875,
91
+ "learning_rate": 2e-05,
92
+ "loss": 1.2964,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.3314917127071823,
97
+ "grad_norm": 1.2109375,
98
+ "learning_rate": 1.9994819965926346e-05,
99
+ "loss": 1.1979,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.35911602209944754,
104
  "grad_norm": 1.0078125,
105
+ "learning_rate": 1.997928523025598e-05,
106
+ "loss": 1.1192,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.3867403314917127,
111
+ "grad_norm": 0.76171875,
112
+ "learning_rate": 1.9953411887080917e-05,
113
+ "loss": 1.0823,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.4143646408839779,
118
+ "grad_norm": 0.70703125,
119
+ "learning_rate": 1.9917226741361014e-05,
120
+ "loss": 1.0383,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.4419889502762431,
125
+ "grad_norm": 0.69921875,
126
+ "learning_rate": 1.987076728115383e-05,
127
+ "loss": 1.0168,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.4696132596685083,
132
+ "grad_norm": 0.68359375,
133
+ "learning_rate": 1.9814081638776743e-05,
134
+ "loss": 1.004,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.4972375690607735,
139
+ "grad_norm": 0.7109375,
140
+ "learning_rate": 1.9747228540941555e-05,
141
+ "loss": 0.993,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.5248618784530387,
146
+ "grad_norm": 0.765625,
147
+ "learning_rate": 1.9670277247913205e-05,
148
+ "loss": 0.9644,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.5524861878453039,
153
+ "grad_norm": 0.76953125,
154
+ "learning_rate": 1.958330748175568e-05,
155
+ "loss": 0.996,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.580110497237569,
160
+ "grad_norm": 0.734375,
161
+ "learning_rate": 1.948640934373939e-05,
162
+ "loss": 0.9704,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.6077348066298343,
167
+ "grad_norm": 0.76953125,
168
+ "learning_rate": 1.9379683220995657e-05,
169
+ "loss": 0.9572,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.6353591160220995,
174
+ "grad_norm": 0.86328125,
175
+ "learning_rate": 1.9263239682514953e-05,
176
+ "loss": 0.9553,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.6629834254143646,
181
+ "grad_norm": 0.75,
182
+ "learning_rate": 1.9137199364596673e-05,
183
+ "loss": 0.9454,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.6906077348066298,
188
+ "grad_norm": 0.75390625,
189
+ "learning_rate": 1.9001692845869113e-05,
190
+ "loss": 0.939,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.7182320441988951,
195
+ "grad_norm": 0.77734375,
196
+ "learning_rate": 1.8856860512009115e-05,
197
+ "loss": 0.9433,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.7458563535911602,
202
+ "grad_norm": 0.76953125,
203
+ "learning_rate": 1.8702852410301556e-05,
204
+ "loss": 0.9329,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.7734806629834254,
209
+ "grad_norm": 0.71484375,
210
+ "learning_rate": 1.853982809418932e-05,
211
+ "loss": 0.9416,
212
  "step": 140
213
  },
214
  {
215
+ "epoch": 0.8011049723756906,
216
+ "grad_norm": 0.7265625,
217
+ "learning_rate": 1.8367956457974872e-05,
218
+ "loss": 0.914,
219
  "step": 145
220
  },
221
  {
222
+ "epoch": 0.8287292817679558,
223
+ "grad_norm": 0.87890625,
224
+ "learning_rate": 1.8187415561844586e-05,
225
+ "loss": 0.9229,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.856353591160221,
230
+ "grad_norm": 0.828125,
231
+ "learning_rate": 1.7998392447397197e-05,
232
+ "loss": 0.9259,
233
  "step": 155
234
  },
235
  {
236
+ "epoch": 0.8839779005524862,
237
+ "grad_norm": 0.85546875,
238
+ "learning_rate": 1.7801082943867406e-05,
239
+ "loss": 0.9421,
240
  "step": 160
241
  },
242
  {
243
+ "epoch": 0.9116022099447514,
244
+ "grad_norm": 0.8046875,
245
+ "learning_rate": 1.7595691465245484e-05,
246
+ "loss": 0.9225,
247
  "step": 165
248
  },
249
  {
250
+ "epoch": 0.9392265193370166,
251
+ "grad_norm": 0.7109375,
252
+ "learning_rate": 1.7382430798502977e-05,
253
+ "loss": 0.9066,
254
  "step": 170
255
  },
256
  {
257
+ "epoch": 0.9668508287292817,
258
+ "grad_norm": 0.83984375,
259
+ "learning_rate": 1.7161521883143936e-05,
260
+ "loss": 0.8903,
261
  "step": 175
262
  },
263
  {
264
+ "epoch": 0.994475138121547,
265
+ "grad_norm": 0.875,
266
+ "learning_rate": 1.693319358231011e-05,
267
+ "loss": 0.9252,
268
  "step": 180
269
  },
270
  {
271
+ "epoch": 1.022099447513812,
272
+ "grad_norm": 0.8203125,
273
+ "learning_rate": 1.6697682445677158e-05,
274
+ "loss": 0.9035,
275
  "step": 185
276
  },
277
  {
278
+ "epoch": 1.0497237569060773,
279
+ "grad_norm": 0.69140625,
280
+ "learning_rate": 1.6455232464387587e-05,
281
+ "loss": 0.9036,
282
  "step": 190
283
  },
284
  {
285
+ "epoch": 1.0773480662983426,
286
+ "grad_norm": 0.71875,
287
+ "learning_rate": 1.6206094818274228e-05,
288
+ "loss": 0.8932,
289
  "step": 195
290
  },
291
  {
292
+ "epoch": 1.1049723756906078,
293
+ "grad_norm": 0.91796875,
294
+ "learning_rate": 1.595052761563627e-05,
295
+ "loss": 0.9065,
296
  "step": 200
297
  },
298
  {
299
+ "epoch": 1.132596685082873,
300
+ "grad_norm": 0.859375,
301
+ "learning_rate": 1.5688795625837274e-05,
302
+ "loss": 0.8995,
303
  "step": 205
304
  },
305
  {
306
+ "epoch": 1.160220994475138,
307
+ "grad_norm": 0.78125,
308
+ "learning_rate": 1.542117000500229e-05,
309
+ "loss": 0.8844,
310
  "step": 210
311
  },
312
  {
313
+ "epoch": 1.1878453038674033,
314
+ "grad_norm": 0.765625,
315
+ "learning_rate": 1.5147928015098309e-05,
316
+ "loss": 0.8894,
317
  "step": 215
318
  },
319
  {
320
+ "epoch": 1.2154696132596685,
321
+ "grad_norm": 0.81640625,
322
+ "learning_rate": 1.4869352736688938e-05,
323
+ "loss": 0.894,
324
  "step": 220
325
  },
326
  {
327
+ "epoch": 1.2430939226519337,
328
+ "grad_norm": 0.76953125,
329
+ "learning_rate": 1.458573277566103e-05,
330
+ "loss": 0.9222,
331
  "step": 225
332
  },
333
  {
334
+ "epoch": 1.270718232044199,
335
+ "grad_norm": 0.75390625,
336
+ "learning_rate": 1.4297361964227004e-05,
337
+ "loss": 0.9014,
338
  "step": 230
339
  },
340
  {
341
+ "epoch": 1.298342541436464,
342
+ "grad_norm": 0.8125,
343
+ "learning_rate": 1.4004539056512667e-05,
344
+ "loss": 0.9052,
345
  "step": 235
346
  },
347
  {
348
+ "epoch": 1.3259668508287292,
349
+ "grad_norm": 0.87109375,
350
+ "learning_rate": 1.3707567419045926e-05,
351
+ "loss": 0.894,
352
  "step": 240
353
  },
354
  {
355
+ "epoch": 1.3535911602209945,
356
+ "grad_norm": 0.8125,
357
+ "learning_rate": 1.3406754716466978e-05,
358
+ "loss": 0.9045,
359
  "step": 245
360
  },
361
  {
362
+ "epoch": 1.3812154696132597,
363
+ "grad_norm": 0.78515625,
364
+ "learning_rate": 1.3102412592785654e-05,
365
+ "loss": 0.8737,
366
  "step": 250
367
  },
368
  {
369
+ "epoch": 1.408839779005525,
370
+ "grad_norm": 0.7734375,
371
+ "learning_rate": 1.2794856348516095e-05,
372
+ "loss": 0.9029,
373
  "step": 255
374
  },
375
  {
376
+ "epoch": 1.43646408839779,
377
+ "grad_norm": 0.83984375,
378
+ "learning_rate": 1.248440461402328e-05,
379
+ "loss": 0.8883,
380
  "step": 260
381
  },
382
  {
383
+ "epoch": 1.4640883977900552,
384
+ "grad_norm": 0.96875,
385
+ "learning_rate": 1.2171379019419786e-05,
386
+ "loss": 0.8932,
387
  "step": 265
388
  },
389
  {
390
+ "epoch": 1.4917127071823204,
391
+ "grad_norm": 0.890625,
392
+ "learning_rate": 1.1856103861354809e-05,
393
+ "loss": 0.8917,
394
  "step": 270
395
  },
396
  {
397
+ "epoch": 1.5193370165745856,
398
+ "grad_norm": 0.78515625,
399
+ "learning_rate": 1.153890576704062e-05,
400
+ "loss": 0.9033,
401
  "step": 275
402
  },
403
  {
404
+ "epoch": 1.5469613259668509,
405
+ "grad_norm": 0.8046875,
406
+ "learning_rate": 1.1220113355864549e-05,
407
+ "loss": 0.8839,
408
  "step": 280
409
  },
410
  {
411
+ "epoch": 1.5745856353591159,
412
+ "grad_norm": 0.8046875,
413
+ "learning_rate": 1.0900056898937055e-05,
414
+ "loss": 0.8887,
415
  "step": 285
416
  },
417
  {
418
+ "epoch": 1.6022099447513813,
419
+ "grad_norm": 0.8046875,
420
+ "learning_rate": 1.0579067976928614e-05,
421
+ "loss": 0.8951,
422
  "step": 290
423
  },
424
  {
425
+ "epoch": 1.6298342541436464,
426
+ "grad_norm": 0.78125,
427
+ "learning_rate": 1.0257479136549889e-05,
428
+ "loss": 0.8954,
429
  "step": 295
430
  },
431
  {
432
+ "epoch": 1.6574585635359116,
433
+ "grad_norm": 0.8359375,
434
+ "learning_rate": 9.935623546031043e-06,
435
+ "loss": 0.9004,
436
  "step": 300
437
  },
438
  {
439
+ "epoch": 1.6850828729281768,
440
+ "grad_norm": 0.85546875,
441
+ "learning_rate": 9.613834649957216e-06,
442
+ "loss": 0.9045,
443
  "step": 305
444
  },
445
  {
446
+ "epoch": 1.7127071823204418,
447
+ "grad_norm": 0.765625,
448
+ "learning_rate": 9.292445823817647e-06,
449
+ "loss": 0.8737,
450
  "step": 310
451
  },
452
  {
453
+ "epoch": 1.7403314917127073,
454
+ "grad_norm": 0.8203125,
455
+ "learning_rate": 8.971790028626395e-06,
456
+ "loss": 0.8722,
457
  "step": 315
458
  },
459
  {
460
+ "epoch": 1.7679558011049723,
461
+ "grad_norm": 0.921875,
462
+ "learning_rate": 8.652199465972462e-06,
463
+ "loss": 0.8995,
464
  "step": 320
465
  },
466
  {
467
+ "epoch": 1.7955801104972375,
468
+ "grad_norm": 0.8203125,
469
+ "learning_rate": 8.334005233856681e-06,
470
+ "loss": 0.9114,
471
  "step": 325
472
  },
473
  {
474
+ "epoch": 1.8232044198895028,
475
+ "grad_norm": 0.79296875,
476
+ "learning_rate": 8.017536983671929e-06,
477
+ "loss": 0.891,
478
  "step": 330
479
  },
480
  {
481
+ "epoch": 1.850828729281768,
482
+ "grad_norm": 0.75,
483
+ "learning_rate": 7.703122578682047e-06,
484
+ "loss": 0.8875,
485
  "step": 335
486
  },
487
  {
488
+ "epoch": 1.8784530386740332,
489
+ "grad_norm": 0.8125,
490
+ "learning_rate": 7.391087754353252e-06,
491
+ "loss": 0.8779,
492
  "step": 340
493
  },
494
  {
495
+ "epoch": 1.9060773480662982,
496
+ "grad_norm": 0.76953125,
497
+ "learning_rate": 7.081755780889978e-06,
498
+ "loss": 0.885,
499
  "step": 345
500
  },
501
  {
502
+ "epoch": 1.9337016574585635,
503
+ "grad_norm": 0.828125,
504
+ "learning_rate": 6.7754471283247594e-06,
505
+ "loss": 0.8875,
506
  "step": 350
507
  },
508
  {
509
+ "epoch": 1.9613259668508287,
510
+ "grad_norm": 0.9140625,
511
+ "learning_rate": 6.472479134509052e-06,
512
+ "loss": 0.9037,
513
  "step": 355
514
  },
515
  {
516
+ "epoch": 1.988950276243094,
517
+ "grad_norm": 0.83203125,
518
+ "learning_rate": 6.173165676349103e-06,
519
+ "loss": 0.8817,
520
  "step": 360
521
  },
522
  {
523
+ "epoch": 2.016574585635359,
524
+ "grad_norm": 0.81640625,
525
+ "learning_rate": 5.8778168446273045e-06,
526
+ "loss": 0.8876,
527
  "step": 365
528
  },
529
  {
530
+ "epoch": 2.044198895027624,
531
+ "grad_norm": 0.77734375,
532
+ "learning_rate": 5.586738622746042e-06,
533
+ "loss": 0.891,
534
  "step": 370
535
  },
536
  {
537
+ "epoch": 2.0718232044198897,
538
+ "grad_norm": 0.80078125,
539
+ "learning_rate": 5.300232569726805e-06,
540
+ "loss": 0.8843,
541
  "step": 375
542
  },
543
  {
544
+ "epoch": 2.0994475138121547,
545
+ "grad_norm": 0.9140625,
546
+ "learning_rate": 5.0185955077929774e-06,
547
+ "loss": 0.8696,
548
  "step": 380
549
  },
550
  {
551
+ "epoch": 2.12707182320442,
552
+ "grad_norm": 0.7578125,
553
+ "learning_rate": 4.742119214860009e-06,
554
+ "loss": 0.8775,
555
  "step": 385
556
  },
557
  {
558
+ "epoch": 2.154696132596685,
559
+ "grad_norm": 0.7578125,
560
+ "learning_rate": 4.471090122251496e-06,
561
+ "loss": 0.8797,
562
  "step": 390
563
  },
564
  {
565
+ "epoch": 2.18232044198895,
566
+ "grad_norm": 0.78125,
567
+ "learning_rate": 4.205789017954364e-06,
568
+ "loss": 0.8832,
569
  "step": 395
570
  },
571
  {
572
+ "epoch": 2.2099447513812156,
573
+ "grad_norm": 0.859375,
574
+ "learning_rate": 3.946490755720621e-06,
575
+ "loss": 0.884,
576
  "step": 400
577
  },
578
  {
579
+ "epoch": 2.2375690607734806,
580
+ "grad_norm": 0.9453125,
581
+ "learning_rate": 3.6934639703169905e-06,
582
+ "loss": 0.8737,
583
  "step": 405
584
  },
585
  {
586
+ "epoch": 2.265193370165746,
587
+ "grad_norm": 0.86328125,
588
+ "learning_rate": 3.4469707992174607e-06,
589
+ "loss": 0.8981,
590
  "step": 410
591
  },
592
  {
593
+ "epoch": 2.292817679558011,
594
+ "grad_norm": 0.80078125,
595
+ "learning_rate": 3.207266611027069e-06,
596
+ "loss": 0.8736,
597
  "step": 415
598
  },
599
  {
600
+ "epoch": 2.320441988950276,
601
+ "grad_norm": 0.78125,
602
+ "learning_rate": 2.97459974091831e-06,
603
+ "loss": 0.8757,
604
  "step": 420
605
  },
606
  {
607
+ "epoch": 2.3480662983425415,
608
+ "grad_norm": 0.8203125,
609
+ "learning_rate": 2.7492112333541744e-06,
610
+ "loss": 0.902,
611
  "step": 425
612
  },
613
  {
614
+ "epoch": 2.3756906077348066,
615
+ "grad_norm": 0.90625,
616
+ "learning_rate": 2.531334592364457e-06,
617
+ "loss": 0.8766,
618
  "step": 430
619
  },
620
  {
621
+ "epoch": 2.403314917127072,
622
+ "grad_norm": 0.7734375,
623
+ "learning_rate": 2.3211955396340003e-06,
624
+ "loss": 0.8982,
625
  "step": 435
626
  },
627
  {
628
+ "epoch": 2.430939226519337,
629
+ "grad_norm": 0.74609375,
630
+ "learning_rate": 2.1190117806534714e-06,
631
+ "loss": 0.8801,
632
  "step": 440
633
  },
634
  {
635
+ "epoch": 2.458563535911602,
636
+ "grad_norm": 0.828125,
637
+ "learning_rate": 1.924992779174999e-06,
638
+ "loss": 0.8707,
639
  "step": 445
640
  },
641
  {
642
+ "epoch": 2.4861878453038675,
643
+ "grad_norm": 0.859375,
644
+ "learning_rate": 1.7393395402063085e-06,
645
+ "loss": 0.8939,
646
  "step": 450
647
  },
648
  {
649
+ "epoch": 2.5138121546961325,
650
+ "grad_norm": 0.859375,
651
+ "learning_rate": 1.5622444017681438e-06,
652
+ "loss": 0.8707,
653
  "step": 455
654
  },
655
  {
656
+ "epoch": 2.541436464088398,
657
+ "grad_norm": 0.78515625,
658
+ "learning_rate": 1.3938908356307846e-06,
659
+ "loss": 0.8771,
660
  "step": 460
661
  },
662
  {
663
+ "epoch": 2.569060773480663,
664
+ "grad_norm": 0.76171875,
665
+ "learning_rate": 1.2344532572360325e-06,
666
+ "loss": 0.857,
667
  "step": 465
668
  },
669
  {
670
+ "epoch": 2.596685082872928,
671
+ "grad_norm": 0.83984375,
672
+ "learning_rate": 1.0840968450016276e-06,
673
+ "loss": 0.885,
674
  "step": 470
675
  },
676
  {
677
+ "epoch": 2.6243093922651934,
678
+ "grad_norm": 0.78125,
679
+ "learning_rate": 9.42977369195286e-07,
680
+ "loss": 0.9007,
681
  "step": 475
682
  },
683
  {
684
+ "epoch": 2.6519337016574585,
685
+ "grad_norm": 0.90234375,
686
+ "learning_rate": 8.112410305556307e-07,
687
+ "loss": 0.8988,
688
  "step": 480
689
  },
690
  {
691
+ "epoch": 2.679558011049724,
692
+ "grad_norm": 0.78125,
693
+ "learning_rate": 6.890243088272453e-07,
694
+ "loss": 0.8702,
695
  "step": 485
696
  },
697
  {
698
+ "epoch": 2.707182320441989,
699
+ "grad_norm": 0.875,
700
+ "learning_rate": 5.764538213667103e-07,
701
+ "loss": 0.8981,
702
  "step": 490
703
  },
704
  {
705
+ "epoch": 2.734806629834254,
706
+ "grad_norm": 0.89453125,
707
+ "learning_rate": 4.73646191966175e-07,
708
+ "loss": 0.8912,
709
  "step": 495
710
  },
711
  {
712
+ "epoch": 2.7624309392265194,
713
+ "grad_norm": 0.875,
714
+ "learning_rate": 3.8070793003030296e-07,
715
+ "loss": 0.8967,
716
  "step": 500
717
  },
718
  {
719
+ "epoch": 2.7900552486187844,
720
+ "grad_norm": 0.828125,
721
+ "learning_rate": 2.9773532023180897e-07,
722
+ "loss": 0.8933,
723
  "step": 505
724
  },
725
  {
726
+ "epoch": 2.81767955801105,
727
+ "grad_norm": 0.7578125,
728
+ "learning_rate": 2.248143227598809e-07,
729
+ "loss": 0.9039,
730
  "step": 510
731
  },
732
  {
733
+ "epoch": 2.845303867403315,
734
+ "grad_norm": 0.7578125,
735
+ "learning_rate": 1.6202048426483652e-07,
736
+ "loss": 0.8864,
737
  "step": 515
738
  },
739
  {
740
+ "epoch": 2.87292817679558,
741
+ "grad_norm": 0.8515625,
742
+ "learning_rate": 1.094188595912804e-07,
743
+ "loss": 0.8997,
744
  "step": 520
745
  },
746
  {
747
+ "epoch": 2.9005524861878453,
748
+ "grad_norm": 0.87109375,
749
+ "learning_rate": 6.706394438083962e-08,
750
+ "loss": 0.8805,
751
  "step": 525
752
  },
753
  {
754
+ "epoch": 2.9281767955801103,
755
+ "grad_norm": 0.7890625,
756
+ "learning_rate": 3.4999618614309784e-08,
757
+ "loss": 0.8959,
758
  "step": 530
759
  },
760
  {
761
+ "epoch": 2.955801104972376,
762
+ "grad_norm": 0.7265625,
763
+ "learning_rate": 1.325910115169471e-08,
764
+ "loss": 0.8552,
765
  "step": 535
766
  },
767
  {
768
+ "epoch": 2.983425414364641,
769
+ "grad_norm": 0.82421875,
770
+ "learning_rate": 1.8649153172423106e-09,
771
+ "loss": 0.8753,
772
  "step": 540
773
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
  {
775
  "epoch": 3.0,
776
+ "step": 543,
777
+ "total_flos": 2.5856227500097536e+16,
778
+ "train_loss": 1.017678025019103,
779
+ "train_runtime": 224.2528,
780
+ "train_samples_per_second": 38.568,
781
+ "train_steps_per_second": 2.421
782
  }
783
  ],
784
  "logging_steps": 5,
785
+ "max_steps": 543,
786
  "num_input_tokens_seen": 0,
787
  "num_train_epochs": 3,
788
  "save_steps": 500,
 
798
  "attributes": {}
799
  }
800
  },
801
+ "total_flos": 2.5856227500097536e+16,
802
  "train_batch_size": 16,
803
  "trial_name": null,
804
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fa81c0842694b8297e4996a6f1dcfc12daedf7a7564e339a703a7f1e1ac1a96
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96799ddaa0622d20b44b5fd1f13fd4ee51acc21d18c3376a5108d307ef6d1ed1
3
  size 5560