sedrickkeh commited on
Commit
02d8d69
·
verified ·
1 Parent(s): 77f80d3

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_unreplicated
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_unreplicated
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6938
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_unreplicated
 
16
 
17
  # OH_DCFT_V3_wo_unreplicated
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_unreplicated dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6938
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_loss": 0.6997450590133667,
4
- "eval_runtime": 25.0703,
5
- "eval_samples_per_second": 202.909,
6
- "eval_steps_per_second": 0.399,
7
- "total_flos": 949746118164480.0,
8
- "train_loss": 0.6543745700223946,
9
- "train_runtime": 4644.4641,
10
- "train_samples_per_second": 62.421,
11
- "train_steps_per_second": 0.122
12
  }
 
1
  {
2
+ "epoch": 2.9880794701986755,
3
+ "eval_loss": 0.6938396692276001,
4
+ "eval_runtime": 101.8638,
5
+ "eval_samples_per_second": 49.939,
6
+ "eval_steps_per_second": 0.393,
7
+ "total_flos": 944302247116800.0,
8
+ "train_loss": 0.6565315867146702,
9
+ "train_runtime": 17042.4484,
10
+ "train_samples_per_second": 17.011,
11
+ "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_loss": 0.6997450590133667,
4
- "eval_runtime": 25.0703,
5
- "eval_samples_per_second": 202.909,
6
- "eval_steps_per_second": 0.399
7
  }
 
1
  {
2
+ "epoch": 2.9880794701986755,
3
+ "eval_loss": 0.6938396692276001,
4
+ "eval_runtime": 101.8638,
5
+ "eval_samples_per_second": 49.939,
6
+ "eval_steps_per_second": 0.393
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 949746118164480.0,
4
- "train_loss": 0.6543745700223946,
5
- "train_runtime": 4644.4641,
6
- "train_samples_per_second": 62.421,
7
- "train_steps_per_second": 0.122
8
  }
 
1
  {
2
+ "epoch": 2.9880794701986755,
3
+ "total_flos": 944302247116800.0,
4
+ "train_loss": 0.6565315867146702,
5
+ "train_runtime": 17042.4484,
6
+ "train_samples_per_second": 17.011,
7
+ "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -1,441 +1,441 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 567,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.05291005291005291,
13
- "grad_norm": 416.71048534958487,
14
  "learning_rate": 5e-06,
15
- "loss": 1.0192,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.10582010582010581,
20
- "grad_norm": 9.82535489154965,
21
  "learning_rate": 5e-06,
22
- "loss": 0.8978,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.15873015873015872,
27
- "grad_norm": 2.4152631986997957,
28
  "learning_rate": 5e-06,
29
- "loss": 0.8522,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.21164021164021163,
34
- "grad_norm": 1.2916074320999635,
35
  "learning_rate": 5e-06,
36
- "loss": 0.8072,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.26455026455026454,
41
- "grad_norm": 1.3893758966727856,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7758,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.31746031746031744,
48
- "grad_norm": 1.2054911466815574,
49
  "learning_rate": 5e-06,
50
- "loss": 0.7646,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.37037037037037035,
55
- "grad_norm": 0.8794856426133802,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7473,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.42328042328042326,
62
- "grad_norm": 1.2198012095245372,
63
  "learning_rate": 5e-06,
64
- "loss": 0.7266,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.47619047619047616,
69
- "grad_norm": 1.4527939889347394,
70
  "learning_rate": 5e-06,
71
- "loss": 0.7296,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.5291005291005291,
76
- "grad_norm": 1.0469501290398182,
77
  "learning_rate": 5e-06,
78
- "loss": 0.7153,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.582010582010582,
83
- "grad_norm": 1.3687805003393385,
84
  "learning_rate": 5e-06,
85
- "loss": 0.7146,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.6349206349206349,
90
- "grad_norm": 0.7597394338069358,
91
  "learning_rate": 5e-06,
92
- "loss": 0.7149,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.6878306878306878,
97
- "grad_norm": 0.7338187068599691,
98
  "learning_rate": 5e-06,
99
- "loss": 0.7077,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.7407407407407407,
104
- "grad_norm": 0.7105896232322656,
105
  "learning_rate": 5e-06,
106
- "loss": 0.7045,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.7936507936507936,
111
- "grad_norm": 0.7782617723012114,
112
  "learning_rate": 5e-06,
113
- "loss": 0.7008,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.8465608465608465,
118
- "grad_norm": 0.7692589442011014,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6997,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.8994708994708994,
125
- "grad_norm": 1.2419889829411472,
126
  "learning_rate": 5e-06,
127
- "loss": 0.707,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.9523809523809523,
132
- "grad_norm": 0.6796905528838658,
133
  "learning_rate": 5e-06,
134
- "loss": 0.7062,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 1.0,
139
- "eval_loss": 0.7010859251022339,
140
- "eval_runtime": 26.0186,
141
- "eval_samples_per_second": 195.514,
142
- "eval_steps_per_second": 0.384,
143
- "step": 189
144
  },
145
  {
146
- "epoch": 1.0052910052910053,
147
- "grad_norm": 1.0978250374797298,
148
  "learning_rate": 5e-06,
149
- "loss": 0.6881,
150
  "step": 190
151
  },
152
  {
153
- "epoch": 1.0582010582010581,
154
- "grad_norm": 0.8714976613775269,
155
  "learning_rate": 5e-06,
156
- "loss": 0.6345,
157
  "step": 200
158
  },
159
  {
160
- "epoch": 1.1111111111111112,
161
- "grad_norm": 0.7509137617425738,
162
  "learning_rate": 5e-06,
163
- "loss": 0.6311,
164
  "step": 210
165
  },
166
  {
167
- "epoch": 1.164021164021164,
168
- "grad_norm": 0.5974905751964193,
169
  "learning_rate": 5e-06,
170
- "loss": 0.6231,
171
  "step": 220
172
  },
173
  {
174
- "epoch": 1.216931216931217,
175
- "grad_norm": 0.8717270870439439,
176
  "learning_rate": 5e-06,
177
- "loss": 0.6467,
178
  "step": 230
179
  },
180
  {
181
- "epoch": 1.2698412698412698,
182
- "grad_norm": 0.6810610972836734,
183
  "learning_rate": 5e-06,
184
- "loss": 0.6451,
185
  "step": 240
186
  },
187
  {
188
- "epoch": 1.3227513227513228,
189
- "grad_norm": 0.7378737080285518,
190
  "learning_rate": 5e-06,
191
- "loss": 0.6307,
192
  "step": 250
193
  },
194
  {
195
- "epoch": 1.3756613756613756,
196
- "grad_norm": 0.6950294694450657,
197
  "learning_rate": 5e-06,
198
- "loss": 0.6339,
199
  "step": 260
200
  },
201
  {
202
- "epoch": 1.4285714285714286,
203
- "grad_norm": 0.6214872817490306,
204
  "learning_rate": 5e-06,
205
- "loss": 0.6344,
206
  "step": 270
207
  },
208
  {
209
- "epoch": 1.4814814814814814,
210
- "grad_norm": 0.7166396689431728,
211
  "learning_rate": 5e-06,
212
- "loss": 0.6316,
213
  "step": 280
214
  },
215
  {
216
- "epoch": 1.5343915343915344,
217
- "grad_norm": 0.748339470872035,
218
  "learning_rate": 5e-06,
219
- "loss": 0.64,
220
  "step": 290
221
  },
222
  {
223
- "epoch": 1.5873015873015874,
224
- "grad_norm": 0.7727641573683094,
225
  "learning_rate": 5e-06,
226
- "loss": 0.6312,
227
  "step": 300
228
  },
229
  {
230
- "epoch": 1.6402116402116402,
231
- "grad_norm": 0.732172271994369,
232
  "learning_rate": 5e-06,
233
- "loss": 0.6297,
234
  "step": 310
235
  },
236
  {
237
- "epoch": 1.693121693121693,
238
- "grad_norm": 0.6023499271138227,
239
  "learning_rate": 5e-06,
240
- "loss": 0.6326,
241
  "step": 320
242
  },
243
  {
244
- "epoch": 1.746031746031746,
245
- "grad_norm": 0.7149841589858245,
246
  "learning_rate": 5e-06,
247
- "loss": 0.6343,
248
  "step": 330
249
  },
250
  {
251
- "epoch": 1.798941798941799,
252
- "grad_norm": 1.004464391677582,
253
  "learning_rate": 5e-06,
254
- "loss": 0.6364,
255
  "step": 340
256
  },
257
  {
258
- "epoch": 1.8518518518518519,
259
- "grad_norm": 0.5867433151491478,
260
  "learning_rate": 5e-06,
261
- "loss": 0.6349,
262
  "step": 350
263
  },
264
  {
265
- "epoch": 1.9047619047619047,
266
- "grad_norm": 0.7255438168551538,
267
  "learning_rate": 5e-06,
268
- "loss": 0.6379,
269
  "step": 360
270
  },
271
  {
272
- "epoch": 1.9576719576719577,
273
- "grad_norm": 0.7221123642587326,
274
  "learning_rate": 5e-06,
275
- "loss": 0.6303,
276
  "step": 370
277
  },
278
  {
279
- "epoch": 2.0,
280
- "eval_loss": 0.690298855304718,
281
- "eval_runtime": 25.7372,
282
- "eval_samples_per_second": 197.652,
283
- "eval_steps_per_second": 0.389,
284
- "step": 378
285
  },
286
  {
287
- "epoch": 2.0105820105820107,
288
- "grad_norm": 0.8886608412212685,
289
  "learning_rate": 5e-06,
290
- "loss": 0.6187,
291
  "step": 380
292
  },
293
  {
294
- "epoch": 2.0634920634920633,
295
- "grad_norm": 0.8003078120118251,
296
  "learning_rate": 5e-06,
297
- "loss": 0.5681,
298
  "step": 390
299
  },
300
  {
301
- "epoch": 2.1164021164021163,
302
- "grad_norm": 0.8666463121587671,
303
  "learning_rate": 5e-06,
304
- "loss": 0.5579,
305
  "step": 400
306
  },
307
  {
308
- "epoch": 2.1693121693121693,
309
- "grad_norm": 0.8539706003079143,
310
  "learning_rate": 5e-06,
311
- "loss": 0.5728,
312
  "step": 410
313
  },
314
  {
315
- "epoch": 2.2222222222222223,
316
- "grad_norm": 1.1616873079519627,
317
  "learning_rate": 5e-06,
318
- "loss": 0.5651,
319
  "step": 420
320
  },
321
  {
322
- "epoch": 2.2751322751322753,
323
- "grad_norm": 0.7480866173033474,
324
  "learning_rate": 5e-06,
325
- "loss": 0.5661,
326
  "step": 430
327
  },
328
  {
329
- "epoch": 2.328042328042328,
330
- "grad_norm": 0.7844596384753618,
331
  "learning_rate": 5e-06,
332
- "loss": 0.5686,
333
  "step": 440
334
  },
335
  {
336
- "epoch": 2.380952380952381,
337
- "grad_norm": 0.7440590331433045,
338
  "learning_rate": 5e-06,
339
- "loss": 0.5659,
340
  "step": 450
341
  },
342
  {
343
- "epoch": 2.433862433862434,
344
- "grad_norm": 0.659290068384902,
345
  "learning_rate": 5e-06,
346
- "loss": 0.5632,
347
  "step": 460
348
  },
349
  {
350
- "epoch": 2.4867724867724865,
351
- "grad_norm": 0.67543738389126,
352
  "learning_rate": 5e-06,
353
- "loss": 0.5663,
354
  "step": 470
355
  },
356
  {
357
- "epoch": 2.5396825396825395,
358
- "grad_norm": 0.7004953960176246,
359
  "learning_rate": 5e-06,
360
- "loss": 0.5645,
361
  "step": 480
362
  },
363
  {
364
- "epoch": 2.5925925925925926,
365
- "grad_norm": 0.6484461516755643,
366
  "learning_rate": 5e-06,
367
- "loss": 0.5744,
368
  "step": 490
369
  },
370
  {
371
- "epoch": 2.6455026455026456,
372
- "grad_norm": 0.6437533712181355,
373
  "learning_rate": 5e-06,
374
- "loss": 0.5736,
375
  "step": 500
376
  },
377
  {
378
- "epoch": 2.6984126984126986,
379
- "grad_norm": 1.0308809556967322,
380
  "learning_rate": 5e-06,
381
- "loss": 0.5764,
382
  "step": 510
383
  },
384
  {
385
- "epoch": 2.751322751322751,
386
- "grad_norm": 0.8500517655693743,
387
  "learning_rate": 5e-06,
388
- "loss": 0.5706,
389
  "step": 520
390
  },
391
  {
392
- "epoch": 2.804232804232804,
393
- "grad_norm": 0.6843226523187806,
394
  "learning_rate": 5e-06,
395
- "loss": 0.5776,
396
  "step": 530
397
  },
398
  {
399
- "epoch": 2.857142857142857,
400
- "grad_norm": 0.8314054966778545,
401
  "learning_rate": 5e-06,
402
- "loss": 0.5776,
403
  "step": 540
404
  },
405
  {
406
- "epoch": 2.91005291005291,
407
- "grad_norm": 0.6641365059618296,
408
  "learning_rate": 5e-06,
409
- "loss": 0.5811,
410
  "step": 550
411
  },
412
  {
413
- "epoch": 2.962962962962963,
414
- "grad_norm": 0.6653280911506885,
415
  "learning_rate": 5e-06,
416
- "loss": 0.5694,
417
  "step": 560
418
  },
419
  {
420
- "epoch": 3.0,
421
- "eval_loss": 0.6997450590133667,
422
- "eval_runtime": 26.1485,
423
- "eval_samples_per_second": 194.543,
424
- "eval_steps_per_second": 0.382,
425
- "step": 567
426
  },
427
  {
428
- "epoch": 3.0,
429
- "step": 567,
430
- "total_flos": 949746118164480.0,
431
- "train_loss": 0.6543745700223946,
432
- "train_runtime": 4644.4641,
433
- "train_samples_per_second": 62.421,
434
- "train_steps_per_second": 0.122
435
  }
436
  ],
437
  "logging_steps": 10,
438
- "max_steps": 567,
439
  "num_input_tokens_seen": 0,
440
  "num_train_epochs": 3,
441
  "save_steps": 500,
@@ -451,7 +451,7 @@
451
  "attributes": {}
452
  }
453
  },
454
- "total_flos": 949746118164480.0,
455
  "train_batch_size": 8,
456
  "trial_name": null,
457
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9880794701986755,
5
  "eval_steps": 500,
6
+ "global_step": 564,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.052980132450331126,
13
+ "grad_norm": 2.3983146952234486,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.9851,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.10596026490066225,
20
+ "grad_norm": 1.7246649174554667,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.8319,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.15894039735099338,
27
+ "grad_norm": 3.601038796091522,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.8075,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.2119205298013245,
34
+ "grad_norm": 1.2059052889786372,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7915,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.26490066225165565,
41
+ "grad_norm": 1.3608839677655582,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.7709,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.31788079470198677,
48
+ "grad_norm": 1.2434441624370192,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.7621,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.3708609271523179,
55
+ "grad_norm": 0.9441777855534653,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7454,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.423841059602649,
62
+ "grad_norm": 1.5125978662591961,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.7248,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.4768211920529801,
69
+ "grad_norm": 0.7471461569802452,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.7275,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.5298013245033113,
76
+ "grad_norm": 0.645516397675585,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.7123,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.5827814569536424,
83
+ "grad_norm": 0.7251445711578004,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.7117,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.6357615894039735,
90
+ "grad_norm": 0.651327566584479,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.7119,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.6887417218543046,
97
+ "grad_norm": 0.6140018870668793,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.7053,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.7417218543046358,
104
+ "grad_norm": 0.5388085038750972,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.7022,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.7947019867549668,
111
+ "grad_norm": 1.010650981679106,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6987,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.847682119205298,
118
+ "grad_norm": 1.1037782052291758,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6976,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.9006622516556292,
125
+ "grad_norm": 0.666699690620748,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.705,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.9536423841059603,
132
+ "grad_norm": 0.5794869194974834,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.7042,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.9960264900662251,
139
+ "eval_loss": 0.6996881365776062,
140
+ "eval_runtime": 101.9424,
141
+ "eval_samples_per_second": 49.901,
142
+ "eval_steps_per_second": 0.392,
143
+ "step": 188
144
  },
145
  {
146
+ "epoch": 1.0066225165562914,
147
+ "grad_norm": 0.8762129194412334,
148
  "learning_rate": 5e-06,
149
+ "loss": 0.6861,
150
  "step": 190
151
  },
152
  {
153
+ "epoch": 1.0596026490066226,
154
+ "grad_norm": 0.7641648019922738,
155
  "learning_rate": 5e-06,
156
+ "loss": 0.6399,
157
  "step": 200
158
  },
159
  {
160
+ "epoch": 1.1125827814569536,
161
+ "grad_norm": 0.6026445432992825,
162
  "learning_rate": 5e-06,
163
+ "loss": 0.6342,
164
  "step": 210
165
  },
166
  {
167
+ "epoch": 1.1655629139072847,
168
+ "grad_norm": 0.5607709134599749,
169
  "learning_rate": 5e-06,
170
+ "loss": 0.6285,
171
  "step": 220
172
  },
173
  {
174
+ "epoch": 1.218543046357616,
175
+ "grad_norm": 0.723598523167553,
176
  "learning_rate": 5e-06,
177
+ "loss": 0.654,
178
  "step": 230
179
  },
180
  {
181
+ "epoch": 1.271523178807947,
182
+ "grad_norm": 0.6634015008522252,
183
  "learning_rate": 5e-06,
184
+ "loss": 0.6474,
185
  "step": 240
186
  },
187
  {
188
+ "epoch": 1.3245033112582782,
189
+ "grad_norm": 0.5676178378824602,
190
  "learning_rate": 5e-06,
191
+ "loss": 0.6356,
192
  "step": 250
193
  },
194
  {
195
+ "epoch": 1.3774834437086092,
196
+ "grad_norm": 0.6219906931731467,
197
  "learning_rate": 5e-06,
198
+ "loss": 0.6396,
199
  "step": 260
200
  },
201
  {
202
+ "epoch": 1.4304635761589404,
203
+ "grad_norm": 0.5539002206307158,
204
  "learning_rate": 5e-06,
205
+ "loss": 0.6395,
206
  "step": 270
207
  },
208
  {
209
+ "epoch": 1.4834437086092715,
210
+ "grad_norm": 0.6706880554061717,
211
  "learning_rate": 5e-06,
212
+ "loss": 0.6364,
213
  "step": 280
214
  },
215
  {
216
+ "epoch": 1.5364238410596025,
217
+ "grad_norm": 0.6250744115575335,
218
  "learning_rate": 5e-06,
219
+ "loss": 0.6455,
220
  "step": 290
221
  },
222
  {
223
+ "epoch": 1.589403973509934,
224
+ "grad_norm": 0.5666575820633527,
225
  "learning_rate": 5e-06,
226
+ "loss": 0.6352,
227
  "step": 300
228
  },
229
  {
230
+ "epoch": 1.6423841059602649,
231
+ "grad_norm": 0.8049891928557037,
232
  "learning_rate": 5e-06,
233
+ "loss": 0.634,
234
  "step": 310
235
  },
236
  {
237
+ "epoch": 1.695364238410596,
238
+ "grad_norm": 0.8098028256502842,
239
  "learning_rate": 5e-06,
240
+ "loss": 0.6379,
241
  "step": 320
242
  },
243
  {
244
+ "epoch": 1.7483443708609272,
245
+ "grad_norm": 0.6314929024368203,
246
  "learning_rate": 5e-06,
247
+ "loss": 0.6394,
248
  "step": 330
249
  },
250
  {
251
+ "epoch": 1.8013245033112582,
252
+ "grad_norm": 0.824620474103318,
253
  "learning_rate": 5e-06,
254
+ "loss": 0.6414,
255
  "step": 340
256
  },
257
  {
258
+ "epoch": 1.8543046357615895,
259
+ "grad_norm": 0.5854556799760776,
260
  "learning_rate": 5e-06,
261
+ "loss": 0.6393,
262
  "step": 350
263
  },
264
  {
265
+ "epoch": 1.9072847682119205,
266
+ "grad_norm": 0.6825161397864904,
267
  "learning_rate": 5e-06,
268
+ "loss": 0.6408,
269
  "step": 360
270
  },
271
  {
272
+ "epoch": 1.9602649006622517,
273
+ "grad_norm": 0.5897191051228083,
274
  "learning_rate": 5e-06,
275
+ "loss": 0.6362,
276
  "step": 370
277
  },
278
  {
279
+ "epoch": 1.9973509933774833,
280
+ "eval_loss": 0.6881988644599915,
281
+ "eval_runtime": 101.7242,
282
+ "eval_samples_per_second": 50.008,
283
+ "eval_steps_per_second": 0.393,
284
+ "step": 377
285
  },
286
  {
287
+ "epoch": 2.013245033112583,
288
+ "grad_norm": 0.8098259238713678,
289
  "learning_rate": 5e-06,
290
+ "loss": 0.6249,
291
  "step": 380
292
  },
293
  {
294
+ "epoch": 2.066225165562914,
295
+ "grad_norm": 0.8384706128552907,
296
  "learning_rate": 5e-06,
297
+ "loss": 0.5796,
298
  "step": 390
299
  },
300
  {
301
+ "epoch": 2.119205298013245,
302
+ "grad_norm": 0.7877590869928718,
303
  "learning_rate": 5e-06,
304
+ "loss": 0.5721,
305
  "step": 400
306
  },
307
  {
308
+ "epoch": 2.172185430463576,
309
+ "grad_norm": 0.8214423131053483,
310
  "learning_rate": 5e-06,
311
+ "loss": 0.5881,
312
  "step": 410
313
  },
314
  {
315
+ "epoch": 2.225165562913907,
316
+ "grad_norm": 0.708950143379715,
317
  "learning_rate": 5e-06,
318
+ "loss": 0.5788,
319
  "step": 420
320
  },
321
  {
322
+ "epoch": 2.2781456953642385,
323
+ "grad_norm": 0.6491889315422662,
324
  "learning_rate": 5e-06,
325
+ "loss": 0.5778,
326
  "step": 430
327
  },
328
  {
329
+ "epoch": 2.3311258278145695,
330
+ "grad_norm": 0.6858462236619034,
331
  "learning_rate": 5e-06,
332
+ "loss": 0.5821,
333
  "step": 440
334
  },
335
  {
336
+ "epoch": 2.384105960264901,
337
+ "grad_norm": 0.780640823583864,
338
  "learning_rate": 5e-06,
339
+ "loss": 0.5768,
340
  "step": 450
341
  },
342
  {
343
+ "epoch": 2.437086092715232,
344
+ "grad_norm": 0.7173053514977337,
345
  "learning_rate": 5e-06,
346
+ "loss": 0.5759,
347
  "step": 460
348
  },
349
  {
350
+ "epoch": 2.4900662251655628,
351
+ "grad_norm": 0.7004632879605499,
352
  "learning_rate": 5e-06,
353
+ "loss": 0.5787,
354
  "step": 470
355
  },
356
  {
357
+ "epoch": 2.543046357615894,
358
+ "grad_norm": 0.7028579263335615,
359
  "learning_rate": 5e-06,
360
+ "loss": 0.579,
361
  "step": 480
362
  },
363
  {
364
+ "epoch": 2.596026490066225,
365
+ "grad_norm": 0.9012109929919548,
366
  "learning_rate": 5e-06,
367
+ "loss": 0.5848,
368
  "step": 490
369
  },
370
  {
371
+ "epoch": 2.6490066225165565,
372
+ "grad_norm": 0.6237112161014274,
373
  "learning_rate": 5e-06,
374
+ "loss": 0.584,
375
  "step": 500
376
  },
377
  {
378
+ "epoch": 2.7019867549668874,
379
+ "grad_norm": 0.6803732464125802,
380
  "learning_rate": 5e-06,
381
+ "loss": 0.5918,
382
  "step": 510
383
  },
384
  {
385
+ "epoch": 2.7549668874172184,
386
+ "grad_norm": 0.7496234836165662,
387
  "learning_rate": 5e-06,
388
+ "loss": 0.5797,
389
  "step": 520
390
  },
391
  {
392
+ "epoch": 2.80794701986755,
393
+ "grad_norm": 0.6761315878843943,
394
  "learning_rate": 5e-06,
395
+ "loss": 0.5921,
396
  "step": 530
397
  },
398
  {
399
+ "epoch": 2.8609271523178808,
400
+ "grad_norm": 0.6002390896713952,
401
  "learning_rate": 5e-06,
402
+ "loss": 0.5887,
403
  "step": 540
404
  },
405
  {
406
+ "epoch": 2.9139072847682117,
407
+ "grad_norm": 0.6064169883870584,
408
  "learning_rate": 5e-06,
409
+ "loss": 0.5919,
410
  "step": 550
411
  },
412
  {
413
+ "epoch": 2.966887417218543,
414
+ "grad_norm": 0.5890652422466117,
415
  "learning_rate": 5e-06,
416
+ "loss": 0.5826,
417
  "step": 560
418
  },
419
  {
420
+ "epoch": 2.9880794701986755,
421
+ "eval_loss": 0.6938396692276001,
422
+ "eval_runtime": 102.1596,
423
+ "eval_samples_per_second": 49.795,
424
+ "eval_steps_per_second": 0.392,
425
+ "step": 564
426
  },
427
  {
428
+ "epoch": 2.9880794701986755,
429
+ "step": 564,
430
+ "total_flos": 944302247116800.0,
431
+ "train_loss": 0.6565315867146702,
432
+ "train_runtime": 17042.4484,
433
+ "train_samples_per_second": 17.011,
434
+ "train_steps_per_second": 0.033
435
  }
436
  ],
437
  "logging_steps": 10,
438
+ "max_steps": 564,
439
  "num_input_tokens_seen": 0,
440
  "num_train_epochs": 3,
441
  "save_steps": 500,
 
451
  "attributes": {}
452
  }
453
  },
454
+ "total_flos": 944302247116800.0,
455
  "train_batch_size": 8,
456
  "trial_name": null,
457
  "trial_params": null
training_eval_loss.png CHANGED
training_loss.png CHANGED