Ba2han commited on
Commit
10dc0a5
·
verified ·
1 Parent(s): e3b6b8c

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61b4ce55f856a67bfdbd003ebe9b33428ea067be73223fd9c3bf1c9940e96dfe
3
  size 1008303016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df7f3da2bd850f33f7298372ff869be8f81c2b3405227fe6c9bd7c6f2f71a131
3
  size 1008303016
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d82d52d99bf15dccd952a3b4f736ae5075d180cf8f005d3c384238593828d1df
3
  size 1086712487
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc159d75d63a73fbd137e19fda0856ce4e3720ab62cac1e9b38b17fedbdf4bf8
3
  size 1086712487
last-checkpoint/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2140468227424749,
6
  "eval_steps": 50,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
@@ -10,368 +10,368 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.004280936454849498,
14
- "grad_norm": 2.3125,
15
  "learning_rate": 0.0,
16
- "loss": 10.99751091003418,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.008561872909698997,
21
- "grad_norm": 2.21875,
22
  "learning_rate": 0.0125,
23
- "loss": 10.999887466430664,
24
  "step": 2
25
  },
26
  {
27
- "epoch": 0.012842809364548496,
28
- "grad_norm": 1.875,
29
  "learning_rate": 0.025,
30
- "loss": 10.661027908325195,
31
  "step": 3
32
  },
33
  {
34
- "epoch": 0.017123745819397993,
35
- "grad_norm": 1.453125,
36
  "learning_rate": 0.02499842659592344,
37
- "loss": 10.258359909057617,
38
  "step": 4
39
  },
40
  {
41
- "epoch": 0.02140468227424749,
42
- "grad_norm": 1.6484375,
43
  "learning_rate": 0.024993706779789817,
44
- "loss": 10.03167724609375,
45
  "step": 5
46
  },
47
  {
48
- "epoch": 0.02568561872909699,
49
- "grad_norm": 2.65625,
50
  "learning_rate": 0.0249858417397876,
51
- "loss": 10.06635856628418,
52
  "step": 6
53
  },
54
  {
55
- "epoch": 0.02996655518394649,
56
- "grad_norm": 1.3515625,
57
  "learning_rate": 0.024974833455898557,
58
- "loss": 9.684752464294434,
59
  "step": 7
60
  },
61
  {
62
- "epoch": 0.034247491638795986,
63
- "grad_norm": 3.03125,
64
  "learning_rate": 0.024960684699399282,
65
- "loss": 9.619985580444336,
66
  "step": 8
67
  },
68
  {
69
- "epoch": 0.038528428093645484,
70
- "grad_norm": 1.2109375,
71
  "learning_rate": 0.024943399032163558,
72
- "loss": 9.322598457336426,
73
  "step": 9
74
  },
75
  {
76
- "epoch": 0.04280936454849498,
77
- "grad_norm": 1.203125,
78
  "learning_rate": 0.024922980805765677,
79
- "loss": 9.167491912841797,
80
  "step": 10
81
  },
82
  {
83
- "epoch": 0.04709030100334448,
84
- "grad_norm": 1.1171875,
85
  "learning_rate": 0.024899435160384942,
86
- "loss": 8.97598934173584,
87
  "step": 11
88
  },
89
  {
90
- "epoch": 0.05137123745819398,
91
- "grad_norm": 1.09375,
92
  "learning_rate": 0.02487276802351166,
93
- "loss": 8.841875076293945,
94
  "step": 12
95
  },
96
  {
97
- "epoch": 0.05565217391304348,
98
- "grad_norm": 0.8125,
99
  "learning_rate": 0.02484298610845493,
100
- "loss": 8.693690299987793,
101
  "step": 13
102
  },
103
  {
104
- "epoch": 0.05993311036789298,
105
- "grad_norm": 0.80859375,
106
  "learning_rate": 0.024810096912652603,
107
- "loss": 8.540060997009277,
108
  "step": 14
109
  },
110
  {
111
- "epoch": 0.06421404682274247,
112
- "grad_norm": 0.62109375,
113
  "learning_rate": 0.024774108715783835,
114
- "loss": 8.427875518798828,
115
  "step": 15
116
  },
117
  {
118
- "epoch": 0.06849498327759197,
119
- "grad_norm": 0.703125,
120
  "learning_rate": 0.02473503057768474,
121
- "loss": 8.279977798461914,
122
  "step": 16
123
  },
124
  {
125
- "epoch": 0.07277591973244148,
126
- "grad_norm": 0.6640625,
127
  "learning_rate": 0.02469287233606759,
128
- "loss": 8.238062858581543,
129
  "step": 17
130
  },
131
  {
132
- "epoch": 0.07705685618729097,
133
- "grad_norm": 0.60546875,
134
  "learning_rate": 0.024647644604044273,
135
- "loss": 8.140103340148926,
136
  "step": 18
137
  },
138
  {
139
- "epoch": 0.08133779264214047,
140
- "grad_norm": 0.6171875,
141
  "learning_rate": 0.024599358767454456,
142
- "loss": 8.044753074645996,
143
  "step": 19
144
  },
145
  {
146
- "epoch": 0.08561872909698996,
147
- "grad_norm": 0.515625,
148
  "learning_rate": 0.024548026981999278,
149
- "loss": 7.991354465484619,
150
  "step": 20
151
  },
152
  {
153
- "epoch": 0.08989966555183947,
154
- "grad_norm": 0.57421875,
155
  "learning_rate": 0.02449366217018122,
156
- "loss": 7.921579837799072,
157
  "step": 21
158
  },
159
  {
160
- "epoch": 0.09418060200668896,
161
- "grad_norm": 0.65234375,
162
  "learning_rate": 0.024436278018050924,
163
- "loss": 7.871639728546143,
164
  "step": 22
165
  },
166
  {
167
- "epoch": 0.09846153846153846,
168
- "grad_norm": 0.50390625,
169
  "learning_rate": 0.02437588897176182,
170
- "loss": 7.7901129722595215,
171
  "step": 23
172
  },
173
  {
174
- "epoch": 0.10274247491638797,
175
- "grad_norm": 0.458984375,
176
  "learning_rate": 0.024312510233933354,
177
- "loss": 7.722414016723633,
178
  "step": 24
179
  },
180
  {
181
- "epoch": 0.10702341137123746,
182
- "grad_norm": 0.671875,
183
  "learning_rate": 0.024246157759823857,
184
- "loss": 7.714761257171631,
185
  "step": 25
186
  },
187
  {
188
- "epoch": 0.11130434782608696,
189
- "grad_norm": 0.73828125,
190
  "learning_rate": 0.024176848253313836,
191
- "loss": 7.689679145812988,
192
  "step": 26
193
  },
194
  {
195
- "epoch": 0.11558528428093645,
196
- "grad_norm": 0.466796875,
197
  "learning_rate": 0.02410459916270091,
198
- "loss": 7.656161308288574,
199
  "step": 27
200
  },
201
  {
202
- "epoch": 0.11986622073578596,
203
  "grad_norm": 0.373046875,
204
  "learning_rate": 0.02402942867630727,
205
- "loss": 7.593247890472412,
206
  "step": 28
207
  },
208
  {
209
- "epoch": 0.12414715719063545,
210
- "grad_norm": 0.365234375,
211
  "learning_rate": 0.02395135571790087,
212
- "loss": 7.558600902557373,
213
  "step": 29
214
  },
215
  {
216
- "epoch": 0.12842809364548494,
217
- "grad_norm": 0.392578125,
218
  "learning_rate": 0.02387039994193148,
219
- "loss": 7.517961502075195,
220
  "step": 30
221
  },
222
  {
223
- "epoch": 0.13270903010033444,
224
- "grad_norm": 0.48828125,
225
  "learning_rate": 0.023786581728582768,
226
- "loss": 7.4646525382995605,
227
  "step": 31
228
  },
229
  {
230
- "epoch": 0.13698996655518395,
231
- "grad_norm": 0.41015625,
232
  "learning_rate": 0.023699922178641697,
233
- "loss": 7.4094624519348145,
234
  "step": 32
235
  },
236
  {
237
- "epoch": 0.14127090301003345,
238
- "grad_norm": 0.376953125,
239
  "learning_rate": 0.023610443108186545,
240
- "loss": 7.402709484100342,
241
  "step": 33
242
  },
243
  {
244
- "epoch": 0.14555183946488295,
245
- "grad_norm": 0.361328125,
246
  "learning_rate": 0.023518167043094777,
247
- "loss": 7.354226112365723,
248
  "step": 34
249
  },
250
  {
251
- "epoch": 0.14983277591973243,
252
- "grad_norm": 0.38671875,
253
  "learning_rate": 0.023423117213372313,
254
- "loss": 7.302793502807617,
255
  "step": 35
256
  },
257
  {
258
- "epoch": 0.15411371237458193,
259
- "grad_norm": 0.484375,
260
  "learning_rate": 0.023325317547305487,
261
- "loss": 7.298144817352295,
262
  "step": 36
263
  },
264
  {
265
- "epoch": 0.15839464882943144,
266
- "grad_norm": 0.56640625,
267
  "learning_rate": 0.023224792665437213,
268
- "loss": 7.267189979553223,
269
  "step": 37
270
  },
271
  {
272
- "epoch": 0.16267558528428094,
273
- "grad_norm": 0.4140625,
274
  "learning_rate": 0.023121567874368934,
275
- "loss": 7.22215461730957,
276
  "step": 38
277
  },
278
  {
279
- "epoch": 0.16695652173913045,
280
- "grad_norm": 0.47265625,
281
  "learning_rate": 0.023015669160389766,
282
- "loss": 7.1571245193481445,
283
  "step": 39
284
  },
285
  {
286
- "epoch": 0.17123745819397992,
287
- "grad_norm": 0.466796875,
288
  "learning_rate": 0.02290712318293464,
289
- "loss": 7.14966344833374,
290
  "step": 40
291
  },
292
  {
293
- "epoch": 0.17551839464882943,
294
- "grad_norm": 0.486328125,
295
  "learning_rate": 0.02279595726787291,
296
- "loss": 7.105873107910156,
297
  "step": 41
298
  },
299
  {
300
- "epoch": 0.17979933110367893,
301
- "grad_norm": 0.46875,
302
  "learning_rate": 0.022682199400629197,
303
- "loss": 7.067068099975586,
304
  "step": 42
305
  },
306
  {
307
- "epoch": 0.18408026755852844,
308
- "grad_norm": 0.49609375,
309
  "learning_rate": 0.022565878219138233,
310
- "loss": 7.027277946472168,
311
  "step": 43
312
  },
313
  {
314
- "epoch": 0.18836120401337791,
315
- "grad_norm": 0.390625,
316
  "learning_rate": 0.022447023006635404,
317
- "loss": 6.947190761566162,
318
  "step": 44
319
  },
320
  {
321
- "epoch": 0.19264214046822742,
322
- "grad_norm": 0.40234375,
323
  "learning_rate": 0.022325663684284847,
324
- "loss": 6.984321594238281,
325
  "step": 45
326
  },
327
  {
328
- "epoch": 0.19692307692307692,
329
- "grad_norm": 0.40234375,
330
  "learning_rate": 0.02220183080364696,
331
- "loss": 6.949177265167236,
332
  "step": 46
333
  },
334
  {
335
- "epoch": 0.20120401337792643,
336
- "grad_norm": 0.353515625,
337
  "learning_rate": 0.022075555538987227,
338
- "loss": 6.896018028259277,
339
  "step": 47
340
  },
341
  {
342
- "epoch": 0.20548494983277593,
343
- "grad_norm": 0.41015625,
344
  "learning_rate": 0.02194686967942823,
345
- "loss": 6.808597564697266,
346
  "step": 48
347
  },
348
  {
349
- "epoch": 0.2097658862876254,
350
- "grad_norm": 0.408203125,
351
  "learning_rate": 0.021815805620946937,
352
- "loss": 6.824460983276367,
353
  "step": 49
354
  },
355
  {
356
- "epoch": 0.2140468227424749,
357
- "grad_norm": 0.4453125,
358
  "learning_rate": 0.021682396358219166,
359
- "loss": 6.777248382568359,
360
  "step": 50
361
  },
362
  {
363
- "epoch": 0.2140468227424749,
364
- "eval_loss": 6.7472333908081055,
365
- "eval_runtime": 51.7313,
366
- "eval_samples_per_second": 11.811,
367
- "eval_steps_per_second": 2.958,
368
  "step": 50
369
  }
370
  ],
371
  "logging_steps": 1,
372
  "max_steps": 200,
373
  "num_input_tokens_seen": 0,
374
- "num_train_epochs": 1,
375
  "save_steps": 50,
376
  "stateful_callbacks": {
377
  "TrainerControl": {
@@ -385,8 +385,8 @@
385
  "attributes": {}
386
  }
387
  },
388
- "total_flos": 3.645919305579571e+16,
389
- "train_batch_size": 2,
390
  "trial_name": null,
391
  "trial_params": null
392
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4280936454849498,
6
  "eval_steps": 50,
7
  "global_step": 50,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.008561872909698997,
14
+ "grad_norm": 2.1875,
15
  "learning_rate": 0.0,
16
+ "loss": 11.000052452087402,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.017123745819397993,
21
+ "grad_norm": 2.15625,
22
  "learning_rate": 0.0125,
23
+ "loss": 11.001190185546875,
24
  "step": 2
25
  },
26
  {
27
+ "epoch": 0.02568561872909699,
28
+ "grad_norm": 1.7890625,
29
  "learning_rate": 0.025,
30
+ "loss": 10.639176368713379,
31
  "step": 3
32
  },
33
  {
34
+ "epoch": 0.034247491638795986,
35
+ "grad_norm": 1.484375,
36
  "learning_rate": 0.02499842659592344,
37
+ "loss": 10.244665145874023,
38
  "step": 4
39
  },
40
  {
41
+ "epoch": 0.04280936454849498,
42
+ "grad_norm": 1.6796875,
43
  "learning_rate": 0.024993706779789817,
44
+ "loss": 10.011271476745605,
45
  "step": 5
46
  },
47
  {
48
+ "epoch": 0.05137123745819398,
49
+ "grad_norm": 2.671875,
50
  "learning_rate": 0.0249858417397876,
51
+ "loss": 9.91366958618164,
52
  "step": 6
53
  },
54
  {
55
+ "epoch": 0.05993311036789298,
56
+ "grad_norm": 2.0,
57
  "learning_rate": 0.024974833455898557,
58
+ "loss": 9.735193252563477,
59
  "step": 7
60
  },
61
  {
62
+ "epoch": 0.06849498327759197,
63
+ "grad_norm": 1.2890625,
64
  "learning_rate": 0.024960684699399282,
65
+ "loss": 9.460551261901855,
66
  "step": 8
67
  },
68
  {
69
+ "epoch": 0.07705685618729097,
70
+ "grad_norm": 1.0546875,
71
  "learning_rate": 0.024943399032163558,
72
+ "loss": 9.263848304748535,
73
  "step": 9
74
  },
75
  {
76
+ "epoch": 0.08561872909698996,
77
+ "grad_norm": 1.1796875,
78
  "learning_rate": 0.024922980805765677,
79
+ "loss": 9.114842414855957,
80
  "step": 10
81
  },
82
  {
83
+ "epoch": 0.09418060200668896,
84
+ "grad_norm": 0.8984375,
85
  "learning_rate": 0.024899435160384942,
86
+ "loss": 8.910664558410645,
87
  "step": 11
88
  },
89
  {
90
+ "epoch": 0.10274247491638797,
91
+ "grad_norm": 0.90234375,
92
  "learning_rate": 0.02487276802351166,
93
+ "loss": 8.746124267578125,
94
  "step": 12
95
  },
96
  {
97
+ "epoch": 0.11130434782608696,
98
+ "grad_norm": 0.921875,
99
  "learning_rate": 0.02484298610845493,
100
+ "loss": 8.617627143859863,
101
  "step": 13
102
  },
103
  {
104
+ "epoch": 0.11986622073578596,
105
+ "grad_norm": 0.6875,
106
  "learning_rate": 0.024810096912652603,
107
+ "loss": 8.474465370178223,
108
  "step": 14
109
  },
110
  {
111
+ "epoch": 0.12842809364548494,
112
+ "grad_norm": 0.69921875,
113
  "learning_rate": 0.024774108715783835,
114
+ "loss": 8.355995178222656,
115
  "step": 15
116
  },
117
  {
118
+ "epoch": 0.13698996655518395,
119
+ "grad_norm": 0.6328125,
120
  "learning_rate": 0.02473503057768474,
121
+ "loss": 8.227234840393066,
122
  "step": 16
123
  },
124
  {
125
+ "epoch": 0.14555183946488295,
126
+ "grad_norm": 0.69921875,
127
  "learning_rate": 0.02469287233606759,
128
+ "loss": 8.151434898376465,
129
  "step": 17
130
  },
131
  {
132
+ "epoch": 0.15411371237458193,
133
+ "grad_norm": 0.58203125,
134
  "learning_rate": 0.024647644604044273,
135
+ "loss": 8.038224220275879,
136
  "step": 18
137
  },
138
  {
139
+ "epoch": 0.16267558528428094,
140
+ "grad_norm": 0.56640625,
141
  "learning_rate": 0.024599358767454456,
142
+ "loss": 7.988001346588135,
143
  "step": 19
144
  },
145
  {
146
+ "epoch": 0.17123745819397992,
147
+ "grad_norm": 0.76953125,
148
  "learning_rate": 0.024548026981999278,
149
+ "loss": 7.911329746246338,
150
  "step": 20
151
  },
152
  {
153
+ "epoch": 0.17979933110367893,
154
+ "grad_norm": 0.50390625,
155
  "learning_rate": 0.02449366217018122,
156
+ "loss": 7.835102558135986,
157
  "step": 21
158
  },
159
  {
160
+ "epoch": 0.18836120401337791,
161
+ "grad_norm": 0.447265625,
162
  "learning_rate": 0.024436278018050924,
163
+ "loss": 7.749161720275879,
164
  "step": 22
165
  },
166
  {
167
+ "epoch": 0.19692307692307692,
168
+ "grad_norm": 0.470703125,
169
  "learning_rate": 0.02437588897176182,
170
+ "loss": 7.731137752532959,
171
  "step": 23
172
  },
173
  {
174
+ "epoch": 0.20548494983277593,
175
+ "grad_norm": 0.54296875,
176
  "learning_rate": 0.024312510233933354,
177
+ "loss": 7.658270835876465,
178
  "step": 24
179
  },
180
  {
181
+ "epoch": 0.2140468227424749,
182
+ "grad_norm": 0.515625,
183
  "learning_rate": 0.024246157759823857,
184
+ "loss": 7.607485771179199,
185
  "step": 25
186
  },
187
  {
188
+ "epoch": 0.22260869565217392,
189
+ "grad_norm": 0.40625,
190
  "learning_rate": 0.024176848253313836,
191
+ "loss": 7.570281028747559,
192
  "step": 26
193
  },
194
  {
195
+ "epoch": 0.2311705685618729,
196
+ "grad_norm": 0.55078125,
197
  "learning_rate": 0.02410459916270091,
198
+ "loss": 7.528012275695801,
199
  "step": 27
200
  },
201
  {
202
+ "epoch": 0.2397324414715719,
203
  "grad_norm": 0.373046875,
204
  "learning_rate": 0.02402942867630727,
205
+ "loss": 7.462031364440918,
206
  "step": 28
207
  },
208
  {
209
+ "epoch": 0.2482943143812709,
210
+ "grad_norm": 0.3671875,
211
  "learning_rate": 0.02395135571790087,
212
+ "loss": 7.411103248596191,
213
  "step": 29
214
  },
215
  {
216
+ "epoch": 0.2568561872909699,
217
+ "grad_norm": 0.4453125,
218
  "learning_rate": 0.02387039994193148,
219
+ "loss": 7.400269031524658,
220
  "step": 30
221
  },
222
  {
223
+ "epoch": 0.2654180602006689,
224
+ "grad_norm": 0.427734375,
225
  "learning_rate": 0.023786581728582768,
226
+ "loss": 7.331328392028809,
227
  "step": 31
228
  },
229
  {
230
+ "epoch": 0.2739799331103679,
231
+ "grad_norm": 0.58203125,
232
  "learning_rate": 0.023699922178641697,
233
+ "loss": 7.33138370513916,
234
  "step": 32
235
  },
236
  {
237
+ "epoch": 0.2825418060200669,
238
+ "grad_norm": 0.484375,
239
  "learning_rate": 0.023610443108186545,
240
+ "loss": 7.266662120819092,
241
  "step": 33
242
  },
243
  {
244
+ "epoch": 0.2911036789297659,
245
+ "grad_norm": 0.5,
246
  "learning_rate": 0.023518167043094777,
247
+ "loss": 7.242274284362793,
248
  "step": 34
249
  },
250
  {
251
+ "epoch": 0.29966555183946486,
252
+ "grad_norm": 0.431640625,
253
  "learning_rate": 0.023423117213372313,
254
+ "loss": 7.186635494232178,
255
  "step": 35
256
  },
257
  {
258
+ "epoch": 0.30822742474916387,
259
+ "grad_norm": 0.458984375,
260
  "learning_rate": 0.023325317547305487,
261
+ "loss": 7.136233806610107,
262
  "step": 36
263
  },
264
  {
265
+ "epoch": 0.3167892976588629,
266
+ "grad_norm": 0.396484375,
267
  "learning_rate": 0.023224792665437213,
268
+ "loss": 7.15239143371582,
269
  "step": 37
270
  },
271
  {
272
+ "epoch": 0.3253511705685619,
273
+ "grad_norm": 0.390625,
274
  "learning_rate": 0.023121567874368934,
275
+ "loss": 7.045711040496826,
276
  "step": 38
277
  },
278
  {
279
+ "epoch": 0.3339130434782609,
280
+ "grad_norm": 0.3359375,
281
  "learning_rate": 0.023015669160389766,
282
+ "loss": 7.051828384399414,
283
  "step": 39
284
  },
285
  {
286
+ "epoch": 0.34247491638795985,
287
+ "grad_norm": 0.35546875,
288
  "learning_rate": 0.02290712318293464,
289
+ "loss": 6.995121479034424,
290
  "step": 40
291
  },
292
  {
293
+ "epoch": 0.35103678929765886,
294
+ "grad_norm": 0.34375,
295
  "learning_rate": 0.02279595726787291,
296
+ "loss": 6.917529106140137,
297
  "step": 41
298
  },
299
  {
300
+ "epoch": 0.35959866220735787,
301
+ "grad_norm": 0.408203125,
302
  "learning_rate": 0.022682199400629197,
303
+ "loss": 6.943573474884033,
304
  "step": 42
305
  },
306
  {
307
+ "epoch": 0.3681605351170569,
308
+ "grad_norm": 0.486328125,
309
  "learning_rate": 0.022565878219138233,
310
+ "loss": 6.849137306213379,
311
  "step": 43
312
  },
313
  {
314
+ "epoch": 0.37672240802675583,
315
+ "grad_norm": 0.431640625,
316
  "learning_rate": 0.022447023006635404,
317
+ "loss": 6.817243576049805,
318
  "step": 44
319
  },
320
  {
321
+ "epoch": 0.38528428093645484,
322
+ "grad_norm": 0.466796875,
323
  "learning_rate": 0.022325663684284847,
324
+ "loss": 6.751210689544678,
325
  "step": 45
326
  },
327
  {
328
+ "epoch": 0.39384615384615385,
329
+ "grad_norm": 0.44140625,
330
  "learning_rate": 0.02220183080364696,
331
+ "loss": 6.731540679931641,
332
  "step": 46
333
  },
334
  {
335
+ "epoch": 0.40240802675585285,
336
+ "grad_norm": 0.4140625,
337
  "learning_rate": 0.022075555538987227,
338
+ "loss": 6.705706596374512,
339
  "step": 47
340
  },
341
  {
342
+ "epoch": 0.41096989966555186,
343
+ "grad_norm": 0.345703125,
344
  "learning_rate": 0.02194686967942823,
345
+ "loss": 6.678059101104736,
346
  "step": 48
347
  },
348
  {
349
+ "epoch": 0.4195317725752508,
350
+ "grad_norm": 0.4140625,
351
  "learning_rate": 0.021815805620946937,
352
+ "loss": 6.6417555809021,
353
  "step": 49
354
  },
355
  {
356
+ "epoch": 0.4280936454849498,
357
+ "grad_norm": 0.462890625,
358
  "learning_rate": 0.021682396358219166,
359
+ "loss": 6.592331886291504,
360
  "step": 50
361
  },
362
  {
363
+ "epoch": 0.4280936454849498,
364
+ "eval_loss": 6.548539638519287,
365
+ "eval_runtime": 51.5345,
366
+ "eval_samples_per_second": 11.856,
367
+ "eval_steps_per_second": 2.969,
368
  "step": 50
369
  }
370
  ],
371
  "logging_steps": 1,
372
  "max_steps": 200,
373
  "num_input_tokens_seen": 0,
374
+ "num_train_epochs": 2,
375
  "save_steps": 50,
376
  "stateful_callbacks": {
377
  "TrainerControl": {
 
385
  "attributes": {}
386
  }
387
  },
388
+ "total_flos": 7.292068282073088e+16,
389
+ "train_batch_size": 4,
390
  "trial_name": null,
391
  "trial_params": null
392
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25d686833094b637e69f7be1bd7dd0c173bb768a2a8b1916830852d7703b16ed
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9224b9e115d57261205215ceae02ffa4a6e8577fbe762a2cbd2cd5c8a9f1ea82
3
  size 5713