terry69 committed on
Commit
4a1239d
·
verified ·
1 Parent(s): ad7c80d

Model save

Browse files
Files changed (4) hide show
  1. README.md +6 -7
  2. all_results.json +6 -11
  3. train_results.json +6 -6
  4. trainer_state.json +407 -344
README.md CHANGED
@@ -2,13 +2,12 @@
2
  license: llama3
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  base_model: meta-llama/Meta-Llama-3-8B
10
  datasets:
11
- - preference-data
12
  model-index:
13
  - name: downstream_0.1p_seed42_level2_syntax
14
  results: []
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # downstream_0.1p_seed42_level2_syntax
21
 
22
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the preference-data dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.0375
25
 
26
  ## Model description
27
 
@@ -56,9 +55,9 @@ The following hyperparameters were used during training:
56
 
57
  ### Training results
58
 
59
- | Training Loss | Epoch | Step | Validation Loss |
60
- |:-------------:|:------:|:----:|:---------------:|
61
- | 1.1119 | 0.9994 | 408 | 1.0375 |
62
 
63
 
64
  ### Framework versions
 
2
  license: llama3
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  base_model: meta-llama/Meta-Llama-3-8B
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: downstream_0.1p_seed42_level2_syntax
13
  results: []
 
18
 
19
  # downstream_0.1p_seed42_level2_syntax
20
 
21
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.0367
24
 
25
  ## Model description
26
 
 
55
 
56
  ### Training results
57
 
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.094 | 1.0 | 454 | 1.0367 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9993876301285977,
3
- "eval_loss": 1.037530541419983,
4
- "eval_runtime": 2.3375,
5
- "eval_samples": 10,
6
- "eval_samples_per_second": 2.995,
7
- "eval_steps_per_second": 0.856,
8
- "total_flos": 1.2948113606049792e+16,
9
- "train_loss": 1.1371603935372596,
10
- "train_runtime": 15464.5644,
11
- "train_samples": 90000,
12
- "train_samples_per_second": 3.378,
13
  "train_steps_per_second": 0.026
14
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.4408836278386688e+16,
4
+ "train_loss": 1.131237539688396,
5
+ "train_runtime": 17315.3957,
6
+ "train_samples": 100000,
7
+ "train_samples_per_second": 3.356,
 
 
 
 
 
8
  "train_steps_per_second": 0.026
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9993876301285977,
3
- "total_flos": 1.2948113606049792e+16,
4
- "train_loss": 1.1371603935372596,
5
- "train_runtime": 15464.5644,
6
- "train_samples": 90000,
7
- "train_samples_per_second": 3.378,
8
  "train_steps_per_second": 0.026
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.4408836278386688e+16,
4
+ "train_loss": 1.131237539688396,
5
+ "train_runtime": 17315.3957,
6
+ "train_samples": 100000,
7
+ "train_samples_per_second": 3.356,
8
  "train_steps_per_second": 0.026
9
  }
trainer_state.json CHANGED
@@ -1,607 +1,670 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9993876301285977,
5
  "eval_steps": 500,
6
- "global_step": 408,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.002449479485609308,
13
- "grad_norm": 1.0681437691408622,
14
- "learning_rate": 4.8780487804878055e-06,
15
- "loss": 1.4232,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.01224739742804654,
20
- "grad_norm": 1.136306986221139,
21
- "learning_rate": 2.4390243902439026e-05,
22
- "loss": 1.3933,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.02449479485609308,
27
- "grad_norm": 0.245338434807422,
28
- "learning_rate": 4.878048780487805e-05,
29
- "loss": 1.3421,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.03674219228413962,
34
- "grad_norm": 0.2375193035213865,
35
- "learning_rate": 7.317073170731707e-05,
36
- "loss": 1.3138,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.04898958971218616,
41
- "grad_norm": 0.18791567927506073,
42
- "learning_rate": 9.75609756097561e-05,
43
- "loss": 1.2873,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.0612369871402327,
48
- "grad_norm": 0.12816603940375418,
49
- "learning_rate": 0.00012195121951219512,
50
- "loss": 1.2549,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.07348438456827924,
55
- "grad_norm": 0.11662240238108945,
56
- "learning_rate": 0.00014634146341463414,
57
- "loss": 1.2099,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.08573178199632578,
62
- "grad_norm": 0.08250874631468892,
63
- "learning_rate": 0.0001707317073170732,
64
- "loss": 1.1902,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.09797917942437231,
69
- "grad_norm": 0.09447143113709522,
70
- "learning_rate": 0.0001951219512195122,
71
- "loss": 1.2005,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.11022657685241886,
76
- "grad_norm": 0.0824021009170569,
77
- "learning_rate": 0.00019994138413588491,
78
- "loss": 1.1864,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.1224739742804654,
83
- "grad_norm": 0.0894227247413217,
84
- "learning_rate": 0.0001997033749537941,
85
- "loss": 1.1574,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.13472137170851195,
90
- "grad_norm": 0.07854947144546601,
91
- "learning_rate": 0.00019928274457498818,
92
- "loss": 1.1595,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.14696876913655849,
97
- "grad_norm": 0.08179447168807087,
98
- "learning_rate": 0.00019868026344503306,
99
- "loss": 1.1668,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.15921616656460502,
104
- "grad_norm": 0.09018877740754977,
105
- "learning_rate": 0.00019789703509552945,
106
- "loss": 1.1517,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.17146356399265156,
111
- "grad_norm": 0.08070743453648781,
112
- "learning_rate": 0.00019693449412283435,
113
- "loss": 1.1557,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.1837109614206981,
118
- "grad_norm": 0.17006096808234306,
119
- "learning_rate": 0.00019579440356038967,
120
- "loss": 1.1265,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.19595835884874463,
125
- "grad_norm": 0.07490774235953891,
126
- "learning_rate": 0.00019447885164947088,
127
- "loss": 1.1411,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.2082057562767912,
132
- "grad_norm": 0.07560268552998486,
133
- "learning_rate": 0.00019299024801426994,
134
- "loss": 1.1346,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.22045315370483773,
139
- "grad_norm": 0.06989244321896809,
140
- "learning_rate": 0.00019133131924831917,
141
- "loss": 1.1373,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.23270055113288426,
146
- "grad_norm": 0.07297326140007601,
147
- "learning_rate": 0.00018950510392033945,
148
- "loss": 1.1262,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.2449479485609308,
153
- "grad_norm": 0.08107861966515256,
154
- "learning_rate": 0.00018751494700866087,
155
- "loss": 1.1266,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.25719534598897736,
160
- "grad_norm": 0.07525334503932822,
161
- "learning_rate": 0.0001853644937744095,
162
- "loss": 1.1337,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.2694427434170239,
167
- "grad_norm": 0.07418001338537485,
168
- "learning_rate": 0.00018305768308468293,
169
- "loss": 1.1527,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.28169014084507044,
174
- "grad_norm": 0.07966858826560685,
175
- "learning_rate": 0.00018059874019794351,
176
- "loss": 1.1275,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.29393753827311697,
181
- "grad_norm": 0.06884328126643421,
182
- "learning_rate": 0.00017799216902484466,
183
- "loss": 1.1142,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.3061849357011635,
188
- "grad_norm": 0.07638833793093423,
189
- "learning_rate": 0.00017524274387866484,
190
- "loss": 1.1489,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.31843233312921004,
195
- "grad_norm": 0.07163478075363215,
196
- "learning_rate": 0.00017235550073046028,
197
- "loss": 1.1334,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.3306797305572566,
202
- "grad_norm": 0.07584970266147063,
203
- "learning_rate": 0.00016933572798495328,
204
- "loss": 1.1394,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.3429271279853031,
209
- "grad_norm": 0.0882549132067985,
210
- "learning_rate": 0.00016618895679405165,
211
- "loss": 1.1266,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.35517452541334965,
216
- "grad_norm": 0.0738337228599522,
217
- "learning_rate": 0.00016292095092574154,
218
- "loss": 1.1356,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.3674219228413962,
223
- "grad_norm": 0.07323403324052054,
224
- "learning_rate": 0.00015953769620691022,
225
- "loss": 1.1448,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.3796693202694427,
230
- "grad_norm": 0.07258910733356848,
231
- "learning_rate": 0.0001560453895594354,
232
- "loss": 1.1255,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.39191671769748926,
237
- "grad_norm": 0.08483741713706569,
238
- "learning_rate": 0.00015245042764962417,
239
- "loss": 1.1203,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.40416411512553585,
244
- "grad_norm": 0.07393069983884801,
245
- "learning_rate": 0.00014875939517179016,
246
- "loss": 1.1305,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.4164115125535824,
251
- "grad_norm": 0.07536661950821844,
252
- "learning_rate": 0.00014497905278743083,
253
- "loss": 1.1142,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.4286589099816289,
258
- "grad_norm": 0.0774588990644394,
259
- "learning_rate": 0.00014111632474209505,
260
- "loss": 1.1014,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.44090630740967546,
265
- "grad_norm": 0.0723327812244184,
266
- "learning_rate": 0.0001371782861826226,
267
- "loss": 1.1215,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.453153704837722,
272
- "grad_norm": 0.07454342646966894,
273
- "learning_rate": 0.00013317215019798638,
274
- "loss": 1.1276,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.46540110226576853,
279
- "grad_norm": 0.07195661618627822,
280
- "learning_rate": 0.00012910525460747344,
281
- "loss": 1.1083,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.47764849969381507,
286
- "grad_norm": 0.07092309315305423,
287
- "learning_rate": 0.00012498504852040434,
288
- "loss": 1.1373,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.4898958971218616,
293
- "grad_norm": 0.07301281736550075,
294
- "learning_rate": 0.00012081907869200849,
295
- "loss": 1.1312,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.5021432945499081,
300
- "grad_norm": 0.07484347637628397,
301
- "learning_rate": 0.00011661497570044738,
302
- "loss": 1.1208,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.5143906919779547,
307
- "grad_norm": 0.0724091132876655,
308
- "learning_rate": 0.00011238043997030329,
309
- "loss": 1.1309,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.5266380894060012,
314
- "grad_norm": 0.2342422867496652,
315
- "learning_rate": 0.00010812322766813461,
316
- "loss": 1.1138,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.5388854868340478,
321
- "grad_norm": 0.07212287103404749,
322
- "learning_rate": 0.00010385113649593137,
323
- "loss": 1.1192,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.5511328842620943,
328
- "grad_norm": 0.07073394667449048,
329
- "learning_rate": 9.957199140849278e-05,
330
- "loss": 1.109,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.5633802816901409,
335
- "grad_norm": 0.06964674186116192,
336
- "learning_rate": 9.529363028088725e-05,
337
- "loss": 1.115,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 0.5756276791181874,
342
- "grad_norm": 0.07117704481271163,
343
- "learning_rate": 9.102388955224703e-05,
344
- "loss": 1.1099,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 0.5878750765462339,
349
- "grad_norm": 0.07216465095526178,
350
- "learning_rate": 8.677058987219295e-05,
351
- "loss": 1.113,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 0.6001224739742804,
356
- "grad_norm": 0.0725366248294856,
357
- "learning_rate": 8.254152177618e-05,
358
- "loss": 1.1047,
359
  "step": 245
360
  },
361
  {
362
- "epoch": 0.612369871402327,
363
- "grad_norm": 0.07979788000565378,
364
- "learning_rate": 7.83444314160013e-05,
365
- "loss": 1.1275,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 0.6246172688303735,
370
- "grad_norm": 0.07038014346187686,
371
- "learning_rate": 7.418700637158742e-05,
372
- "loss": 1.0942,
373
  "step": 255
374
  },
375
  {
376
- "epoch": 0.6368646662584201,
377
- "grad_norm": 0.07043699403227373,
378
- "learning_rate": 7.00768615700881e-05,
379
- "loss": 1.1188,
380
  "step": 260
381
  },
382
  {
383
- "epoch": 0.6491120636864667,
384
- "grad_norm": 0.07317860829882807,
385
- "learning_rate": 6.60215253380287e-05,
386
- "loss": 1.1228,
387
  "step": 265
388
  },
389
  {
390
- "epoch": 0.6613594611145132,
391
- "grad_norm": 0.0736268694865736,
392
- "learning_rate": 6.202842561208758e-05,
393
- "loss": 1.1004,
394
  "step": 270
395
  },
396
  {
397
- "epoch": 0.6736068585425597,
398
- "grad_norm": 0.0681966580195897,
399
- "learning_rate": 5.810487633375261e-05,
400
- "loss": 1.0964,
401
  "step": 275
402
  },
403
  {
404
- "epoch": 0.6858542559706062,
405
- "grad_norm": 0.06988692587157964,
406
- "learning_rate": 5.425806405277609e-05,
407
- "loss": 1.1123,
408
  "step": 280
409
  },
410
  {
411
- "epoch": 0.6981016533986528,
412
- "grad_norm": 0.06961689512931302,
413
- "learning_rate": 5.049503476396627e-05,
414
- "loss": 1.1254,
415
  "step": 285
416
  },
417
  {
418
- "epoch": 0.7103490508266993,
419
- "grad_norm": 0.06848007555420067,
420
- "learning_rate": 4.682268100142566e-05,
421
- "loss": 1.1064,
422
  "step": 290
423
  },
424
  {
425
- "epoch": 0.7225964482547459,
426
- "grad_norm": 0.06848238221490942,
427
- "learning_rate": 4.32477292138746e-05,
428
- "loss": 1.1078,
429
  "step": 295
430
  },
431
  {
432
- "epoch": 0.7348438456827924,
433
- "grad_norm": 0.06932096702672658,
434
- "learning_rate": 3.9776727444184744e-05,
435
- "loss": 1.1359,
436
  "step": 300
437
  },
438
  {
439
- "epoch": 0.747091243110839,
440
- "grad_norm": 0.06964742874998163,
441
- "learning_rate": 3.641603333568831e-05,
442
- "loss": 1.1071,
443
  "step": 305
444
  },
445
  {
446
- "epoch": 0.7593386405388854,
447
- "grad_norm": 0.07515967784857266,
448
- "learning_rate": 3.3171802487232086e-05,
449
- "loss": 1.114,
450
  "step": 310
451
  },
452
  {
453
- "epoch": 0.771586037966932,
454
- "grad_norm": 0.07140996525669459,
455
- "learning_rate": 3.0049977178305076e-05,
456
- "loss": 1.1179,
457
  "step": 315
458
  },
459
  {
460
- "epoch": 0.7838334353949785,
461
- "grad_norm": 0.06922024794802567,
462
- "learning_rate": 2.7056275484891304e-05,
463
- "loss": 1.0962,
464
  "step": 320
465
  },
466
  {
467
- "epoch": 0.7960808328230251,
468
- "grad_norm": 0.07028157088055875,
469
- "learning_rate": 2.419618080598417e-05,
470
- "loss": 1.1361,
471
  "step": 325
472
  },
473
  {
474
- "epoch": 0.8083282302510717,
475
- "grad_norm": 0.07083633675990936,
476
- "learning_rate": 2.1474931819945553e-05,
477
- "loss": 1.1025,
478
  "step": 330
479
  },
480
  {
481
- "epoch": 0.8205756276791182,
482
- "grad_norm": 0.07118501791774294,
483
- "learning_rate": 1.889751288910645e-05,
484
- "loss": 1.0959,
485
  "step": 335
486
  },
487
  {
488
- "epoch": 0.8328230251071648,
489
- "grad_norm": 0.0724941459460009,
490
- "learning_rate": 1.6468644930184095e-05,
491
- "loss": 1.0963,
492
  "step": 340
493
  },
494
  {
495
- "epoch": 0.8450704225352113,
496
- "grad_norm": 0.07065248333558355,
497
- "learning_rate": 1.4192776767238158e-05,
498
- "loss": 1.1097,
499
  "step": 345
500
  },
501
  {
502
- "epoch": 0.8573178199632578,
503
- "grad_norm": 0.06638354595318986,
504
- "learning_rate": 1.2074076983003958e-05,
505
- "loss": 1.1086,
506
  "step": 350
507
  },
508
  {
509
- "epoch": 0.8695652173913043,
510
- "grad_norm": 0.0678250769481932,
511
- "learning_rate": 1.0116426283528302e-05,
512
- "loss": 1.1164,
513
  "step": 355
514
  },
515
  {
516
- "epoch": 0.8818126148193509,
517
- "grad_norm": 0.06908465334552778,
518
- "learning_rate": 8.323410390093522e-06,
519
- "loss": 1.1219,
520
  "step": 360
521
  },
522
  {
523
- "epoch": 0.8940600122473974,
524
- "grad_norm": 0.07002593669930346,
525
- "learning_rate": 6.698313471448547e-06,
526
- "loss": 1.1057,
527
  "step": 365
528
  },
529
  {
530
- "epoch": 0.906307409675444,
531
- "grad_norm": 0.06951335625337747,
532
- "learning_rate": 5.244112128377476e-06,
533
- "loss": 1.1156,
534
  "step": 370
535
  },
536
  {
537
- "epoch": 0.9185548071034905,
538
- "grad_norm": 0.07086629076783696,
539
- "learning_rate": 3.963469941623288e-06,
540
- "loss": 1.0996,
541
  "step": 375
542
  },
543
  {
544
- "epoch": 0.9308022045315371,
545
- "grad_norm": 0.07007458613323735,
546
- "learning_rate": 2.858732593153246e-06,
547
- "loss": 1.1211,
548
  "step": 380
549
  },
550
  {
551
- "epoch": 0.9430496019595835,
552
- "grad_norm": 0.0665201643250434,
553
- "learning_rate": 1.9319235697021763e-06,
554
- "loss": 1.1165,
555
  "step": 385
556
  },
557
  {
558
- "epoch": 0.9552969993876301,
559
- "grad_norm": 0.06858672635827863,
560
- "learning_rate": 1.1847404564628185e-06,
561
- "loss": 1.0881,
562
  "step": 390
563
  },
564
  {
565
- "epoch": 0.9675443968156767,
566
- "grad_norm": 0.07063191807948407,
567
- "learning_rate": 6.185518277123214e-07,
568
- "loss": 1.1031,
569
  "step": 395
570
  },
571
  {
572
- "epoch": 0.9797917942437232,
573
- "grad_norm": 0.06843622739420911,
574
- "learning_rate": 2.343947400698432e-07,
575
- "loss": 1.1103,
576
  "step": 400
577
  },
578
  {
579
- "epoch": 0.9920391916717698,
580
- "grad_norm": 0.0681228499191145,
581
- "learning_rate": 3.2972832976918554e-08,
582
- "loss": 1.1119,
583
  "step": 405
584
  },
585
  {
586
- "epoch": 0.9993876301285977,
587
- "eval_loss": 1.037530541419983,
588
- "eval_runtime": 2.1401,
589
- "eval_samples_per_second": 3.271,
590
- "eval_steps_per_second": 0.935,
591
- "step": 408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  },
593
  {
594
- "epoch": 0.9993876301285977,
595
- "step": 408,
596
- "total_flos": 1.2948113606049792e+16,
597
- "train_loss": 1.1371603935372596,
598
- "train_runtime": 15464.5644,
599
- "train_samples_per_second": 3.378,
600
  "train_steps_per_second": 0.026
601
  }
602
  ],
603
  "logging_steps": 5,
604
- "max_steps": 408,
605
  "num_input_tokens_seen": 0,
606
  "num_train_epochs": 1,
607
  "save_steps": 100,
@@ -617,7 +680,7 @@
617
  "attributes": {}
618
  }
619
  },
620
- "total_flos": 1.2948113606049792e+16,
621
  "train_batch_size": 8,
622
  "trial_name": null,
623
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 454,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0022026431718061676,
13
+ "grad_norm": 1.0637864900736664,
14
+ "learning_rate": 4.347826086956522e-06,
15
+ "loss": 1.4366,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.011013215859030838,
20
+ "grad_norm": 1.153707883328667,
21
+ "learning_rate": 2.173913043478261e-05,
22
+ "loss": 1.4048,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.022026431718061675,
27
+ "grad_norm": 0.24033826612761172,
28
+ "learning_rate": 4.347826086956522e-05,
29
+ "loss": 1.3638,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.03303964757709251,
34
+ "grad_norm": 0.23098755386890235,
35
+ "learning_rate": 6.521739130434783e-05,
36
+ "loss": 1.3035,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.04405286343612335,
41
+ "grad_norm": 0.18769938367180908,
42
+ "learning_rate": 8.695652173913044e-05,
43
+ "loss": 1.2767,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.05506607929515418,
48
+ "grad_norm": 0.14239545432122608,
49
+ "learning_rate": 0.00010869565217391305,
50
+ "loss": 1.2196,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.06607929515418502,
55
+ "grad_norm": 0.10372099126300886,
56
+ "learning_rate": 0.00013043478260869567,
57
+ "loss": 1.21,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.07709251101321586,
62
+ "grad_norm": 0.10109724419729917,
63
+ "learning_rate": 0.00015217391304347827,
64
+ "loss": 1.2161,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.0881057268722467,
69
+ "grad_norm": 0.08677348076375273,
70
+ "learning_rate": 0.00017391304347826088,
71
+ "loss": 1.2002,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.09911894273127753,
76
+ "grad_norm": 0.07988238174290498,
77
+ "learning_rate": 0.0001956521739130435,
78
+ "loss": 1.1907,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.11013215859030837,
83
+ "grad_norm": 0.08057978560906214,
84
+ "learning_rate": 0.0001999525719713366,
85
+ "loss": 1.1572,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.1211453744493392,
90
+ "grad_norm": 0.08125586602301926,
91
+ "learning_rate": 0.0001997599727063717,
92
+ "loss": 1.1575,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.13215859030837004,
97
+ "grad_norm": 0.08174265113795184,
98
+ "learning_rate": 0.00019941952317728147,
99
+ "loss": 1.1662,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.14317180616740088,
104
+ "grad_norm": 0.09240936715995939,
105
+ "learning_rate": 0.00019893172795069144,
106
+ "loss": 1.1561,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.15418502202643172,
111
+ "grad_norm": 0.07863084458052047,
112
+ "learning_rate": 0.0001982973099683902,
113
+ "loss": 1.137,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.16519823788546256,
118
+ "grad_norm": 0.08035585420548778,
119
+ "learning_rate": 0.00019751720947588602,
120
+ "loss": 1.1417,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.1762114537444934,
125
+ "grad_norm": 0.07156903107546937,
126
+ "learning_rate": 0.00019659258262890683,
127
+ "loss": 1.156,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.18722466960352424,
132
+ "grad_norm": 0.07056162758348125,
133
+ "learning_rate": 0.000195524799779908,
134
+ "loss": 1.135,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.19823788546255505,
139
+ "grad_norm": 0.07468007657011916,
140
+ "learning_rate": 0.00019431544344712776,
141
+ "loss": 1.1285,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.2092511013215859,
146
+ "grad_norm": 0.07250524328449914,
147
+ "learning_rate": 0.00019296630596920023,
148
+ "loss": 1.1335,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.22026431718061673,
153
+ "grad_norm": 0.06955764714773345,
154
+ "learning_rate": 0.0001914793868488021,
155
+ "loss": 1.1512,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.23127753303964757,
160
+ "grad_norm": 0.07221286385490396,
161
+ "learning_rate": 0.0001898568897892697,
162
+ "loss": 1.1306,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.2422907488986784,
167
+ "grad_norm": 0.07486140156540808,
168
+ "learning_rate": 0.00018810121942857845,
169
+ "loss": 1.1342,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.2533039647577093,
174
+ "grad_norm": 0.07131722083122453,
175
+ "learning_rate": 0.00018621497777552507,
176
+ "loss": 1.1307,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.2643171806167401,
181
+ "grad_norm": 0.07348336925535924,
182
+ "learning_rate": 0.00018420096035339452,
183
+ "loss": 1.1222,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.2753303964757709,
188
+ "grad_norm": 0.07501908796003587,
189
+ "learning_rate": 0.00018206215205682683,
190
+ "loss": 1.1116,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.28634361233480177,
195
+ "grad_norm": 0.07330385148735,
196
+ "learning_rate": 0.000179801722728024,
197
+ "loss": 1.1373,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.2973568281938326,
202
+ "grad_norm": 0.073420013451975,
203
+ "learning_rate": 0.00017742302245885383,
204
+ "loss": 1.1053,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.30837004405286345,
209
+ "grad_norm": 0.07954193429333527,
210
+ "learning_rate": 0.00017492957662581295,
211
+ "loss": 1.1232,
212
  "step": 140
213
  },
214
  {
215
+ "epoch": 0.31938325991189426,
216
+ "grad_norm": 0.07217417992157714,
217
+ "learning_rate": 0.00017232508066520702,
218
+ "loss": 1.1237,
219
  "step": 145
220
  },
221
  {
222
+ "epoch": 0.3303964757709251,
223
+ "grad_norm": 0.07802555489353481,
224
+ "learning_rate": 0.0001696133945962927,
225
+ "loss": 1.1208,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.34140969162995594,
230
+ "grad_norm": 0.08970080549205696,
231
+ "learning_rate": 0.00016679853730049743,
232
+ "loss": 1.1404,
233
  "step": 155
234
  },
235
  {
236
+ "epoch": 0.3524229074889868,
237
+ "grad_norm": 0.07726401697227049,
238
+ "learning_rate": 0.00016388468056519612,
239
+ "loss": 1.0981,
240
  "step": 160
241
  },
242
  {
243
+ "epoch": 0.3634361233480176,
244
+ "grad_norm": 0.0738469257596569,
245
+ "learning_rate": 0.00016087614290087208,
246
+ "loss": 1.1245,
247
  "step": 165
248
  },
249
  {
250
+ "epoch": 0.3744493392070485,
251
+ "grad_norm": 0.0760961629898493,
252
+ "learning_rate": 0.00015777738314082514,
253
+ "loss": 1.1282,
254
  "step": 170
255
  },
256
  {
257
+ "epoch": 0.3854625550660793,
258
+ "grad_norm": 0.07461575712201869,
259
+ "learning_rate": 0.00015459299383291345,
260
+ "loss": 1.1206,
261
  "step": 175
262
  },
263
  {
264
+ "epoch": 0.3964757709251101,
265
+ "grad_norm": 0.07597642898156688,
266
+ "learning_rate": 0.00015132769443312207,
267
+ "loss": 1.1151,
268
  "step": 180
269
  },
270
  {
271
+ "epoch": 0.40748898678414097,
272
+ "grad_norm": 0.07220979125832422,
273
+ "learning_rate": 0.00014798632431104592,
274
+ "loss": 1.1313,
275
  "step": 185
276
  },
277
  {
278
+ "epoch": 0.4185022026431718,
279
+ "grad_norm": 0.07235062387347811,
280
+ "learning_rate": 0.00014457383557765386,
281
+ "loss": 1.126,
282
  "step": 190
283
  },
284
  {
285
+ "epoch": 0.42951541850220265,
286
+ "grad_norm": 0.07249541910324554,
287
+ "learning_rate": 0.00014109528574596301,
288
+ "loss": 1.1223,
289
  "step": 195
290
  },
291
  {
292
+ "epoch": 0.44052863436123346,
293
+ "grad_norm": 0.07276295542602365,
294
+ "learning_rate": 0.00013755583023550126,
295
+ "loss": 1.0954,
296
  "step": 200
297
  },
298
  {
299
+ "epoch": 0.45154185022026433,
300
+ "grad_norm": 0.08222792797320272,
301
+ "learning_rate": 0.00013396071473166613,
302
+ "loss": 1.1109,
303
  "step": 205
304
  },
305
  {
306
+ "epoch": 0.46255506607929514,
307
+ "grad_norm": 0.07000329815419548,
308
+ "learning_rate": 0.00013031526741130435,
309
+ "loss": 1.1122,
310
  "step": 210
311
  },
312
  {
313
+ "epoch": 0.473568281938326,
314
+ "grad_norm": 0.07273029465748866,
315
+ "learning_rate": 0.0001266248910460341,
316
+ "loss": 1.1098,
317
  "step": 215
318
  },
319
  {
320
+ "epoch": 0.4845814977973568,
321
+ "grad_norm": 0.07891968539218898,
322
+ "learning_rate": 0.0001228950549950134,
323
+ "loss": 1.1235,
324
  "step": 220
325
  },
326
  {
327
+ "epoch": 0.4955947136563877,
328
+ "grad_norm": 0.07948656131261005,
329
+ "learning_rate": 0.00011913128709902181,
330
+ "loss": 1.119,
331
  "step": 225
332
  },
333
  {
334
+ "epoch": 0.5066079295154186,
335
+ "grad_norm": 0.07426812917091022,
336
+ "learning_rate": 0.00011533916548786857,
337
+ "loss": 1.1153,
338
  "step": 230
339
  },
340
  {
341
+ "epoch": 0.5176211453744494,
342
+ "grad_norm": 0.07320813310432901,
343
+ "learning_rate": 0.00011152431031326978,
344
+ "loss": 1.1189,
345
  "step": 235
346
  },
347
  {
348
+ "epoch": 0.5286343612334802,
349
+ "grad_norm": 0.07027215565752497,
350
+ "learning_rate": 0.0001076923754194464,
351
+ "loss": 1.0921,
352
  "step": 240
353
  },
354
  {
355
+ "epoch": 0.539647577092511,
356
+ "grad_norm": 0.07197803964893518,
357
+ "learning_rate": 0.00010384903996378783,
358
+ "loss": 1.139,
359
  "step": 245
360
  },
361
  {
362
+ "epoch": 0.5506607929515418,
363
+ "grad_norm": 0.07476406384806204,
364
+ "learning_rate": 0.0001,
365
+ "loss": 1.113,
366
  "step": 250
367
  },
368
  {
369
+ "epoch": 0.5616740088105727,
370
+ "grad_norm": 0.07773914011299175,
371
+ "learning_rate": 9.615096003621221e-05,
372
+ "loss": 1.1112,
373
  "step": 255
374
  },
375
  {
376
+ "epoch": 0.5726872246696035,
377
+ "grad_norm": 0.07599058071843097,
378
+ "learning_rate": 9.230762458055363e-05,
379
+ "loss": 1.0823,
380
  "step": 260
381
  },
382
  {
383
+ "epoch": 0.5837004405286343,
384
+ "grad_norm": 0.0791178168791927,
385
+ "learning_rate": 8.847568968673026e-05,
386
+ "loss": 1.1322,
387
  "step": 265
388
  },
389
  {
390
+ "epoch": 0.5947136563876652,
391
+ "grad_norm": 0.08321320900809528,
392
+ "learning_rate": 8.466083451213144e-05,
393
+ "loss": 1.1129,
394
  "step": 270
395
  },
396
  {
397
+ "epoch": 0.6057268722466961,
398
+ "grad_norm": 0.07548913525051125,
399
+ "learning_rate": 8.086871290097821e-05,
400
+ "loss": 1.1062,
401
  "step": 275
402
  },
403
  {
404
+ "epoch": 0.6167400881057269,
405
+ "grad_norm": 0.07309802934315239,
406
+ "learning_rate": 7.710494500498662e-05,
407
+ "loss": 1.1129,
408
  "step": 280
409
  },
410
  {
411
+ "epoch": 0.6277533039647577,
412
+ "grad_norm": 0.069929912224867,
413
+ "learning_rate": 7.337510895396591e-05,
414
+ "loss": 1.1142,
415
  "step": 285
416
  },
417
  {
418
+ "epoch": 0.6387665198237885,
419
+ "grad_norm": 0.06916254050976436,
420
+ "learning_rate": 6.968473258869566e-05,
421
+ "loss": 1.1115,
422
  "step": 290
423
  },
424
  {
425
+ "epoch": 0.6497797356828194,
426
+ "grad_norm": 0.07179141392925117,
427
+ "learning_rate": 6.603928526833387e-05,
428
+ "loss": 1.1114,
429
  "step": 295
430
  },
431
  {
432
+ "epoch": 0.6607929515418502,
433
+ "grad_norm": 0.07393948815408477,
434
+ "learning_rate": 6.244416976449875e-05,
435
+ "loss": 1.0976,
436
  "step": 300
437
  },
438
  {
439
+ "epoch": 0.6718061674008811,
440
+ "grad_norm": 0.06872619691818142,
441
+ "learning_rate": 5.890471425403703e-05,
442
+ "loss": 1.1146,
443
  "step": 305
444
  },
445
  {
446
+ "epoch": 0.6828193832599119,
447
+ "grad_norm": 0.07576414521745392,
448
+ "learning_rate": 5.542616442234618e-05,
449
+ "loss": 1.1147,
450
  "step": 310
451
  },
452
  {
453
+ "epoch": 0.6938325991189427,
454
+ "grad_norm": 0.07330914446070216,
455
+ "learning_rate": 5.201367568895408e-05,
456
+ "loss": 1.0951,
457
  "step": 315
458
  },
459
  {
460
+ "epoch": 0.7048458149779736,
461
+ "grad_norm": 0.07224487325459926,
462
+ "learning_rate": 4.8672305566877964e-05,
463
+ "loss": 1.1086,
464
  "step": 320
465
  },
466
  {
467
+ "epoch": 0.7158590308370044,
468
+ "grad_norm": 0.075306080937695,
469
+ "learning_rate": 4.540700616708658e-05,
470
+ "loss": 1.1092,
471
  "step": 325
472
  },
473
  {
474
+ "epoch": 0.7268722466960352,
475
+ "grad_norm": 0.07437067674720446,
476
+ "learning_rate": 4.222261685917489e-05,
477
+ "loss": 1.0921,
478
  "step": 330
479
  },
480
  {
481
+ "epoch": 0.737885462555066,
482
+ "grad_norm": 0.0711166096176443,
483
+ "learning_rate": 3.9123857099127936e-05,
484
+ "loss": 1.1074,
485
  "step": 335
486
  },
487
  {
488
+ "epoch": 0.748898678414097,
489
+ "grad_norm": 0.07140111450874426,
490
+ "learning_rate": 3.6115319434803894e-05,
491
+ "loss": 1.1162,
492
  "step": 340
493
  },
494
  {
495
+ "epoch": 0.7599118942731278,
496
+ "grad_norm": 0.07495611302185953,
497
+ "learning_rate": 3.32014626995026e-05,
498
+ "loss": 1.1003,
499
  "step": 345
500
  },
501
  {
502
+ "epoch": 0.7709251101321586,
503
+ "grad_norm": 0.07717586378795258,
504
+ "learning_rate": 3.0386605403707346e-05,
505
+ "loss": 1.0927,
506
  "step": 350
507
  },
508
  {
509
+ "epoch": 0.7819383259911894,
510
+ "grad_norm": 0.07229554748902306,
511
+ "learning_rate": 2.7674919334793035e-05,
512
+ "loss": 1.1108,
513
  "step": 355
514
  },
515
  {
516
+ "epoch": 0.7929515418502202,
517
+ "grad_norm": 0.0768434750345143,
518
+ "learning_rate": 2.507042337418707e-05,
519
+ "loss": 1.1058,
520
  "step": 360
521
  },
522
  {
523
+ "epoch": 0.8039647577092511,
524
+ "grad_norm": 0.07347159962460627,
525
+ "learning_rate": 2.2576977541146193e-05,
526
+ "loss": 1.1115,
527
  "step": 365
528
  },
529
  {
530
+ "epoch": 0.8149779735682819,
531
+ "grad_norm": 0.0690702684713988,
532
+ "learning_rate": 2.0198277271976052e-05,
533
+ "loss": 1.1091,
534
  "step": 370
535
  },
536
  {
537
+ "epoch": 0.8259911894273128,
538
+ "grad_norm": 0.06989352121524041,
539
+ "learning_rate": 1.793784794317319e-05,
540
+ "loss": 1.0988,
541
  "step": 375
542
  },
543
  {
544
+ "epoch": 0.8370044052863436,
545
+ "grad_norm": 0.06951408734875156,
546
+ "learning_rate": 1.5799039646605486e-05,
547
+ "loss": 1.1,
548
  "step": 380
549
  },
550
  {
551
+ "epoch": 0.8480176211453745,
552
+ "grad_norm": 0.06836481993836284,
553
+ "learning_rate": 1.3785022224474942e-05,
554
+ "loss": 1.102,
555
  "step": 385
556
  },
557
  {
558
+ "epoch": 0.8590308370044053,
559
+ "grad_norm": 0.06852988441116457,
560
+ "learning_rate": 1.1898780571421552e-05,
561
+ "loss": 1.0987,
562
  "step": 390
563
  },
564
  {
565
+ "epoch": 0.8700440528634361,
566
+ "grad_norm": 0.07030606192740464,
567
+ "learning_rate": 1.0143110210730312e-05,
568
+ "loss": 1.1009,
569
  "step": 395
570
  },
571
  {
572
+ "epoch": 0.8810572687224669,
573
+ "grad_norm": 0.06918150786144664,
574
+ "learning_rate": 8.520613151197898e-06,
575
+ "loss": 1.1123,
576
  "step": 400
577
  },
578
  {
579
+ "epoch": 0.8920704845814978,
580
+ "grad_norm": 0.07235026662376734,
581
+ "learning_rate": 7.033694030799787e-06,
582
+ "loss": 1.098,
583
  "step": 405
584
  },
585
  {
586
+ "epoch": 0.9030837004405287,
587
+ "grad_norm": 0.06951148470437334,
588
+ "learning_rate": 5.684556552872256e-06,
589
+ "loss": 1.0975,
590
+ "step": 410
591
+ },
592
+ {
593
+ "epoch": 0.9140969162995595,
594
+ "grad_norm": 0.06872065759217105,
595
+ "learning_rate": 4.475200220092002e-06,
596
+ "loss": 1.0896,
597
+ "step": 415
598
+ },
599
+ {
600
+ "epoch": 0.9251101321585903,
601
+ "grad_norm": 0.06809525315060157,
602
+ "learning_rate": 3.40741737109318e-06,
603
+ "loss": 1.1158,
604
+ "step": 420
605
+ },
606
+ {
607
+ "epoch": 0.9361233480176211,
608
+ "grad_norm": 0.07195357579119646,
609
+ "learning_rate": 2.482790524113998e-06,
610
+ "loss": 1.1097,
611
+ "step": 425
612
+ },
613
+ {
614
+ "epoch": 0.947136563876652,
615
+ "grad_norm": 0.06817141756269769,
616
+ "learning_rate": 1.7026900316098215e-06,
617
+ "loss": 1.1113,
618
+ "step": 430
619
+ },
620
+ {
621
+ "epoch": 0.9581497797356828,
622
+ "grad_norm": 0.07218201005051188,
623
+ "learning_rate": 1.0682720493085607e-06,
624
+ "loss": 1.1049,
625
+ "step": 435
626
+ },
627
+ {
628
+ "epoch": 0.9691629955947136,
629
+ "grad_norm": 0.06979182488846636,
630
+ "learning_rate": 5.804768227185565e-07,
631
+ "loss": 1.1125,
632
+ "step": 440
633
+ },
634
+ {
635
+ "epoch": 0.9801762114537445,
636
+ "grad_norm": 0.06813879663952505,
637
+ "learning_rate": 2.400272936283088e-07,
638
+ "loss": 1.1002,
639
+ "step": 445
640
+ },
641
+ {
642
+ "epoch": 0.9911894273127754,
643
+ "grad_norm": 0.06852453577386937,
644
+ "learning_rate": 4.74280286634099e-08,
645
+ "loss": 1.094,
646
+ "step": 450
647
+ },
648
+ {
649
+ "epoch": 1.0,
650
+ "eval_loss": 1.0366965532302856,
651
+ "eval_runtime": 2.2297,
652
+ "eval_samples_per_second": 3.139,
653
+ "eval_steps_per_second": 0.897,
654
+ "step": 454
655
  },
656
  {
657
+ "epoch": 1.0,
658
+ "step": 454,
659
+ "total_flos": 1.4408836278386688e+16,
660
+ "train_loss": 1.131237539688396,
661
+ "train_runtime": 17315.3957,
662
+ "train_samples_per_second": 3.356,
663
  "train_steps_per_second": 0.026
664
  }
665
  ],
666
  "logging_steps": 5,
667
+ "max_steps": 454,
668
  "num_input_tokens_seen": 0,
669
  "num_train_epochs": 1,
670
  "save_steps": 100,
 
680
  "attributes": {}
681
  }
682
  },
683
+ "total_flos": 1.4408836278386688e+16,
684
  "train_batch_size": 8,
685
  "trial_name": null,
686
  "trial_params": null