irishprancer commited on
Commit
d59223c
·
verified ·
1 Parent(s): c4844b9

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eba20958cbb4baef53f6228a9a06deeae50c5a2f40d1fba26afd7caccd2ea83c
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:348daef4d62db9a5fa3b649cca6826fc23ec3c49e1fb43eaa81d96b56a38718c
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fc4a217d70fa350a2b486bc3b1da2dd3b854e14e24169dafa1e11fdf2d7112d
3
  size 1054136250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:494a5ea21271cf0928971accb090d8e84141054b34ceaff64cbe4817f22fea4b
3
  size 1054136250
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed14e15604e1097b80da74a65c68f380dc6bb673bf5694a945c25e7931ad5a75
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65f6c1dd354391c569e4be3e0b1b637345be25b99bb32967d995788c70f82738
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb248e7cc2fe7b509c9e866be7b72af3b33225d8b86373c1a62393cc3a24f4da
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ca6ba6bc5430af54b4982610d295ab940fcfca9eb66d0098ad0a404420eb1d4
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,351 +1,464 @@
1
  {
2
- "best_metric": 0.7266653776168823,
3
- "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 10.0,
5
  "eval_steps": 150,
6
- "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2222222222222222,
13
- "grad_norm": 1.9097713232040405,
14
  "learning_rate": 2.9999999999999984e-06,
15
- "loss": 0.6618,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.4444444444444444,
20
- "grad_norm": 1.888712763786316,
21
  "learning_rate": 5.999999999999997e-06,
22
- "loss": 0.7252,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.6666666666666666,
27
- "grad_norm": 1.6720666885375977,
28
  "learning_rate": 8.999999999999993e-06,
29
- "loss": 0.7332,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.8888888888888888,
34
- "grad_norm": 1.5767813920974731,
35
  "learning_rate": 1.1999999999999994e-05,
36
- "loss": 0.7368,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 1.1111111111111112,
41
- "grad_norm": 1.2497552633285522,
42
  "learning_rate": 1.499999999999999e-05,
43
- "loss": 0.8943,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 1.3333333333333333,
48
- "grad_norm": 1.1341147422790527,
49
  "learning_rate": 1.7999999999999987e-05,
50
- "loss": 0.7142,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 1.5555555555555556,
55
- "grad_norm": 1.5113084316253662,
56
  "learning_rate": 2.0999999999999985e-05,
57
- "loss": 0.7582,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 1.7777777777777777,
62
- "grad_norm": 1.8728289604187012,
63
  "learning_rate": 2.3999999999999987e-05,
64
- "loss": 0.7075,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 2.0,
69
- "grad_norm": 2.064190149307251,
70
  "learning_rate": 2.6999999999999982e-05,
71
- "loss": 0.7701,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 2.2222222222222223,
76
- "grad_norm": 1.5590786933898926,
77
  "learning_rate": 2.999999999999998e-05,
78
- "loss": 0.7272,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 2.4444444444444446,
83
- "grad_norm": 1.7018474340438843,
84
  "learning_rate": 2.999999702723961e-05,
85
- "loss": 0.7472,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 2.6666666666666665,
90
- "grad_norm": 2.7952871322631836,
91
  "learning_rate": 2.9999988108959667e-05,
92
  "loss": 0.57,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 2.888888888888889,
97
- "grad_norm": 2.1137964725494385,
98
  "learning_rate": 2.9999973245163695e-05,
99
- "loss": 0.7048,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 3.111111111111111,
104
- "grad_norm": 2.2451610565185547,
105
  "learning_rate": 2.999995243585758e-05,
106
- "loss": 0.7171,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 3.3333333333333335,
111
- "grad_norm": 1.9218461513519287,
112
  "learning_rate": 2.9999925681049573e-05,
113
- "loss": 0.6642,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 3.3333333333333335,
118
- "eval_loss": 0.7679360508918762,
119
- "eval_runtime": 0.4614,
120
- "eval_samples_per_second": 21.674,
121
- "eval_steps_per_second": 21.674,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 3.5555555555555554,
126
- "grad_norm": 2.5762360095977783,
127
  "learning_rate": 2.9999892980750276e-05,
128
- "loss": 0.695,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 3.7777777777777777,
133
- "grad_norm": 2.3902077674865723,
134
  "learning_rate": 2.9999854334972655e-05,
135
- "loss": 0.6922,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 4.0,
140
- "grad_norm": 2.540234327316284,
141
  "learning_rate": 2.999980974373202e-05,
142
- "loss": 0.6677,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 4.222222222222222,
147
- "grad_norm": 1.588454008102417,
148
  "learning_rate": 2.9999759207046055e-05,
149
- "loss": 0.5905,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 4.444444444444445,
154
- "grad_norm": 1.6733808517456055,
155
  "learning_rate": 2.9999702724934783e-05,
156
- "loss": 0.7107,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 4.666666666666667,
161
- "grad_norm": 3.4343762397766113,
162
  "learning_rate": 2.99996402974206e-05,
163
  "loss": 0.6277,
164
  "step": 210
165
  },
166
  {
167
  "epoch": 4.888888888888889,
168
- "grad_norm": 2.086362838745117,
169
  "learning_rate": 2.9999571924528243e-05,
170
- "loss": 0.6739,
171
  "step": 220
172
  },
173
  {
174
  "epoch": 5.111111111111111,
175
- "grad_norm": 2.628204822540283,
176
  "learning_rate": 2.9999497606284816e-05,
177
- "loss": 0.6024,
178
  "step": 230
179
  },
180
  {
181
  "epoch": 5.333333333333333,
182
- "grad_norm": 1.4835840463638306,
183
  "learning_rate": 2.9999417342719775e-05,
184
- "loss": 0.6943,
185
  "step": 240
186
  },
187
  {
188
  "epoch": 5.555555555555555,
189
- "grad_norm": 2.2181918621063232,
190
  "learning_rate": 2.9999331133864935e-05,
191
- "loss": 0.6474,
192
  "step": 250
193
  },
194
  {
195
  "epoch": 5.777777777777778,
196
- "grad_norm": 1.7070634365081787,
197
  "learning_rate": 2.9999238979754465e-05,
198
- "loss": 0.6096,
199
  "step": 260
200
  },
201
  {
202
  "epoch": 6.0,
203
- "grad_norm": 1.9004781246185303,
204
  "learning_rate": 2.99991408804249e-05,
205
- "loss": 0.5758,
206
  "step": 270
207
  },
208
  {
209
  "epoch": 6.222222222222222,
210
- "grad_norm": 2.1397156715393066,
211
  "learning_rate": 2.999903683591511e-05,
212
- "loss": 0.5736,
213
  "step": 280
214
  },
215
  {
216
  "epoch": 6.444444444444445,
217
- "grad_norm": 1.384970784187317,
218
  "learning_rate": 2.9998926846266345e-05,
219
- "loss": 0.6135,
220
  "step": 290
221
  },
222
  {
223
  "epoch": 6.666666666666667,
224
- "grad_norm": 2.2915239334106445,
225
  "learning_rate": 2.9998810911522193e-05,
226
- "loss": 0.6228,
227
  "step": 300
228
  },
229
  {
230
  "epoch": 6.666666666666667,
231
- "eval_loss": 0.7389177083969116,
232
- "eval_runtime": 0.4555,
233
- "eval_samples_per_second": 21.953,
234
- "eval_steps_per_second": 21.953,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 6.888888888888889,
239
- "grad_norm": 2.712688446044922,
240
  "learning_rate": 2.9998689031728615e-05,
241
- "loss": 0.6483,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 7.111111111111111,
246
- "grad_norm": 1.764075517654419,
247
  "learning_rate": 2.9998561206933918e-05,
248
- "loss": 0.5864,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 7.333333333333333,
253
- "grad_norm": 1.7712483406066895,
254
  "learning_rate": 2.9998427437188766e-05,
255
- "loss": 0.5796,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 7.555555555555555,
260
- "grad_norm": 2.3483595848083496,
261
  "learning_rate": 2.999828772254618e-05,
262
- "loss": 0.6038,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 7.777777777777778,
267
- "grad_norm": 2.456615686416626,
268
  "learning_rate": 2.9998142063061544e-05,
269
- "loss": 0.6626,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 8.0,
274
- "grad_norm": 1.322672963142395,
275
  "learning_rate": 2.9997990458792583e-05,
276
- "loss": 0.6039,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 8.222222222222221,
281
- "grad_norm": 1.9034680128097534,
282
  "learning_rate": 2.9997832909799397e-05,
283
- "loss": 0.5495,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 8.444444444444445,
288
- "grad_norm": 1.9364817142486572,
289
  "learning_rate": 2.9997669416144432e-05,
290
- "loss": 0.6405,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 8.666666666666666,
295
- "grad_norm": 1.0501103401184082,
296
  "learning_rate": 2.999749997789249e-05,
297
- "loss": 0.5397,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 8.88888888888889,
302
- "grad_norm": 1.5247255563735962,
303
  "learning_rate": 2.9997324595110723e-05,
304
  "loss": 0.6544,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 9.11111111111111,
309
- "grad_norm": 1.3862415552139282,
310
  "learning_rate": 2.9997143267868663e-05,
311
- "loss": 0.595,
312
  "step": 410
313
  },
314
  {
315
  "epoch": 9.333333333333334,
316
- "grad_norm": 2.4849624633789062,
317
  "learning_rate": 2.999695599623817e-05,
318
- "loss": 0.6229,
319
  "step": 420
320
  },
321
  {
322
  "epoch": 9.555555555555555,
323
- "grad_norm": 2.6004281044006348,
324
  "learning_rate": 2.9996762780293483e-05,
325
- "loss": 0.5753,
326
  "step": 430
327
  },
328
  {
329
  "epoch": 9.777777777777779,
330
- "grad_norm": 1.5287330150604248,
331
  "learning_rate": 2.9996563620111176e-05,
332
- "loss": 0.5294,
333
  "step": 440
334
  },
335
  {
336
  "epoch": 10.0,
337
- "grad_norm": 1.3864028453826904,
338
  "learning_rate": 2.9996358515770198e-05,
339
- "loss": 0.5417,
340
  "step": 450
341
  },
342
  {
343
  "epoch": 10.0,
344
- "eval_loss": 0.7266653776168823,
345
- "eval_runtime": 0.4076,
346
- "eval_samples_per_second": 24.534,
347
- "eval_steps_per_second": 24.534,
348
  "step": 450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  }
350
  ],
351
  "logging_steps": 10,
@@ -365,7 +478,7 @@
365
  "attributes": {}
366
  }
367
  },
368
- "total_flos": 4801636770840576.0,
369
  "train_batch_size": 2,
370
  "trial_name": null,
371
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7217289209365845,
3
+ "best_model_checkpoint": "./output/checkpoint-600",
4
+ "epoch": 13.333333333333334,
5
  "eval_steps": 150,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2222222222222222,
13
+ "grad_norm": 1.9086298942565918,
14
  "learning_rate": 2.9999999999999984e-06,
15
+ "loss": 0.6619,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.4444444444444444,
20
+ "grad_norm": 1.888395071029663,
21
  "learning_rate": 5.999999999999997e-06,
22
+ "loss": 0.7259,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.6666666666666666,
27
+ "grad_norm": 1.6719470024108887,
28
  "learning_rate": 8.999999999999993e-06,
29
+ "loss": 0.7335,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.8888888888888888,
34
+ "grad_norm": 1.5774726867675781,
35
  "learning_rate": 1.1999999999999994e-05,
36
+ "loss": 0.7373,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 1.1111111111111112,
41
+ "grad_norm": 1.249552607536316,
42
  "learning_rate": 1.499999999999999e-05,
43
+ "loss": 0.8944,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 1.3333333333333333,
48
+ "grad_norm": 1.1349461078643799,
49
  "learning_rate": 1.7999999999999987e-05,
50
+ "loss": 0.7144,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 1.5555555555555556,
55
+ "grad_norm": 1.5111842155456543,
56
  "learning_rate": 2.0999999999999985e-05,
57
+ "loss": 0.7577,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 1.7777777777777777,
62
+ "grad_norm": 1.873070478439331,
63
  "learning_rate": 2.3999999999999987e-05,
64
+ "loss": 0.7073,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 2.0,
69
+ "grad_norm": 2.0650975704193115,
70
  "learning_rate": 2.6999999999999982e-05,
71
+ "loss": 0.7702,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 2.2222222222222223,
76
+ "grad_norm": 1.5584607124328613,
77
  "learning_rate": 2.999999999999998e-05,
78
+ "loss": 0.7269,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 2.4444444444444446,
83
+ "grad_norm": 1.7033145427703857,
84
  "learning_rate": 2.999999702723961e-05,
85
+ "loss": 0.7475,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 2.6666666666666665,
90
+ "grad_norm": 2.7943344116210938,
91
  "learning_rate": 2.9999988108959667e-05,
92
  "loss": 0.57,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 2.888888888888889,
97
+ "grad_norm": 2.112865686416626,
98
  "learning_rate": 2.9999973245163695e-05,
99
+ "loss": 0.704,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 3.111111111111111,
104
+ "grad_norm": 2.2598509788513184,
105
  "learning_rate": 2.999995243585758e-05,
106
+ "loss": 0.717,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 3.3333333333333335,
111
+ "grad_norm": 1.9296040534973145,
112
  "learning_rate": 2.9999925681049573e-05,
113
+ "loss": 0.6639,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 3.3333333333333335,
118
+ "eval_loss": 0.7678037881851196,
119
+ "eval_runtime": 0.4654,
120
+ "eval_samples_per_second": 21.488,
121
+ "eval_steps_per_second": 21.488,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 3.5555555555555554,
126
+ "grad_norm": 2.576017141342163,
127
  "learning_rate": 2.9999892980750276e-05,
128
+ "loss": 0.6945,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 3.7777777777777777,
133
+ "grad_norm": 2.4086973667144775,
134
  "learning_rate": 2.9999854334972655e-05,
135
+ "loss": 0.6925,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 4.0,
140
+ "grad_norm": 2.5403313636779785,
141
  "learning_rate": 2.999980974373202e-05,
142
+ "loss": 0.6681,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 4.222222222222222,
147
+ "grad_norm": 1.6049163341522217,
148
  "learning_rate": 2.9999759207046055e-05,
149
+ "loss": 0.5901,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 4.444444444444445,
154
+ "grad_norm": 1.6813507080078125,
155
  "learning_rate": 2.9999702724934783e-05,
156
+ "loss": 0.7106,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 4.666666666666667,
161
+ "grad_norm": 3.4500820636749268,
162
  "learning_rate": 2.99996402974206e-05,
163
  "loss": 0.6277,
164
  "step": 210
165
  },
166
  {
167
  "epoch": 4.888888888888889,
168
+ "grad_norm": 2.07940411567688,
169
  "learning_rate": 2.9999571924528243e-05,
170
+ "loss": 0.6731,
171
  "step": 220
172
  },
173
  {
174
  "epoch": 5.111111111111111,
175
+ "grad_norm": 2.62994647026062,
176
  "learning_rate": 2.9999497606284816e-05,
177
+ "loss": 0.6025,
178
  "step": 230
179
  },
180
  {
181
  "epoch": 5.333333333333333,
182
+ "grad_norm": 1.4846452474594116,
183
  "learning_rate": 2.9999417342719775e-05,
184
+ "loss": 0.6941,
185
  "step": 240
186
  },
187
  {
188
  "epoch": 5.555555555555555,
189
+ "grad_norm": 2.218034267425537,
190
  "learning_rate": 2.9999331133864935e-05,
191
+ "loss": 0.6477,
192
  "step": 250
193
  },
194
  {
195
  "epoch": 5.777777777777778,
196
+ "grad_norm": 1.7151379585266113,
197
  "learning_rate": 2.9999238979754465e-05,
198
+ "loss": 0.6094,
199
  "step": 260
200
  },
201
  {
202
  "epoch": 6.0,
203
+ "grad_norm": 1.9011706113815308,
204
  "learning_rate": 2.99991408804249e-05,
205
+ "loss": 0.5759,
206
  "step": 270
207
  },
208
  {
209
  "epoch": 6.222222222222222,
210
+ "grad_norm": 2.1471989154815674,
211
  "learning_rate": 2.999903683591511e-05,
212
+ "loss": 0.574,
213
  "step": 280
214
  },
215
  {
216
  "epoch": 6.444444444444445,
217
+ "grad_norm": 1.3847769498825073,
218
  "learning_rate": 2.9998926846266345e-05,
219
+ "loss": 0.613,
220
  "step": 290
221
  },
222
  {
223
  "epoch": 6.666666666666667,
224
+ "grad_norm": 2.2905008792877197,
225
  "learning_rate": 2.9998810911522193e-05,
226
+ "loss": 0.6224,
227
  "step": 300
228
  },
229
  {
230
  "epoch": 6.666666666666667,
231
+ "eval_loss": 0.7388900518417358,
232
+ "eval_runtime": 0.4459,
233
+ "eval_samples_per_second": 22.426,
234
+ "eval_steps_per_second": 22.426,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 6.888888888888889,
239
+ "grad_norm": 2.6983234882354736,
240
  "learning_rate": 2.9998689031728615e-05,
241
+ "loss": 0.6484,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 7.111111111111111,
246
+ "grad_norm": 1.7625339031219482,
247
  "learning_rate": 2.9998561206933918e-05,
248
+ "loss": 0.5866,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 7.333333333333333,
253
+ "grad_norm": 1.7633429765701294,
254
  "learning_rate": 2.9998427437188766e-05,
255
+ "loss": 0.5797,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 7.555555555555555,
260
+ "grad_norm": 2.347116470336914,
261
  "learning_rate": 2.999828772254618e-05,
262
+ "loss": 0.603,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 7.777777777777778,
267
+ "grad_norm": 2.4734201431274414,
268
  "learning_rate": 2.9998142063061544e-05,
269
+ "loss": 0.6625,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 8.0,
274
+ "grad_norm": 1.330693006515503,
275
  "learning_rate": 2.9997990458792583e-05,
276
+ "loss": 0.6044,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 8.222222222222221,
281
+ "grad_norm": 1.9030860662460327,
282
  "learning_rate": 2.9997832909799397e-05,
283
+ "loss": 0.549,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 8.444444444444445,
288
+ "grad_norm": 1.935556173324585,
289
  "learning_rate": 2.9997669416144432e-05,
290
+ "loss": 0.641,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 8.666666666666666,
295
+ "grad_norm": 1.049513816833496,
296
  "learning_rate": 2.999749997789249e-05,
297
+ "loss": 0.5395,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 8.88888888888889,
302
+ "grad_norm": 1.5243322849273682,
303
  "learning_rate": 2.9997324595110723e-05,
304
  "loss": 0.6544,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 9.11111111111111,
309
+ "grad_norm": 1.3851348161697388,
310
  "learning_rate": 2.9997143267868663e-05,
311
+ "loss": 0.5948,
312
  "step": 410
313
  },
314
  {
315
  "epoch": 9.333333333333334,
316
+ "grad_norm": 2.487696409225464,
317
  "learning_rate": 2.999695599623817e-05,
318
+ "loss": 0.6226,
319
  "step": 420
320
  },
321
  {
322
  "epoch": 9.555555555555555,
323
+ "grad_norm": 2.581589698791504,
324
  "learning_rate": 2.9996762780293483e-05,
325
+ "loss": 0.5751,
326
  "step": 430
327
  },
328
  {
329
  "epoch": 9.777777777777779,
330
+ "grad_norm": 1.5271048545837402,
331
  "learning_rate": 2.9996563620111176e-05,
332
+ "loss": 0.5295,
333
  "step": 440
334
  },
335
  {
336
  "epoch": 10.0,
337
+ "grad_norm": 1.3882054090499878,
338
  "learning_rate": 2.9996358515770198e-05,
339
+ "loss": 0.5419,
340
  "step": 450
341
  },
342
  {
343
  "epoch": 10.0,
344
+ "eval_loss": 0.7256744503974915,
345
+ "eval_runtime": 0.4708,
346
+ "eval_samples_per_second": 21.239,
347
+ "eval_steps_per_second": 21.239,
348
  "step": 450
349
+ },
350
+ {
351
+ "epoch": 10.222222222222221,
352
+ "grad_norm": 2.2227277755737305,
353
+ "learning_rate": 2.9996147467351836e-05,
354
+ "loss": 0.5062,
355
+ "step": 460
356
+ },
357
+ {
358
+ "epoch": 10.444444444444445,
359
+ "grad_norm": 1.412768006324768,
360
+ "learning_rate": 2.9995930474939753e-05,
361
+ "loss": 0.4908,
362
+ "step": 470
363
+ },
364
+ {
365
+ "epoch": 10.666666666666666,
366
+ "grad_norm": 1.9368879795074463,
367
+ "learning_rate": 2.9995707538619954e-05,
368
+ "loss": 0.6362,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 10.88888888888889,
373
+ "grad_norm": 2.137639045715332,
374
+ "learning_rate": 2.9995478658480802e-05,
375
+ "loss": 0.5532,
376
+ "step": 490
377
+ },
378
+ {
379
+ "epoch": 11.11111111111111,
380
+ "grad_norm": 1.867410659790039,
381
+ "learning_rate": 2.9995243834613023e-05,
382
+ "loss": 0.5231,
383
+ "step": 500
384
+ },
385
+ {
386
+ "epoch": 11.333333333333334,
387
+ "grad_norm": 1.6794224977493286,
388
+ "learning_rate": 2.9995003067109687e-05,
389
+ "loss": 0.5388,
390
+ "step": 510
391
+ },
392
+ {
393
+ "epoch": 11.555555555555555,
394
+ "grad_norm": 2.639946699142456,
395
+ "learning_rate": 2.9994756356066226e-05,
396
+ "loss": 0.5847,
397
+ "step": 520
398
+ },
399
+ {
400
+ "epoch": 11.777777777777779,
401
+ "grad_norm": 2.2483253479003906,
402
+ "learning_rate": 2.999450370158044e-05,
403
+ "loss": 0.5348,
404
+ "step": 530
405
+ },
406
+ {
407
+ "epoch": 12.0,
408
+ "grad_norm": 1.535469651222229,
409
+ "learning_rate": 2.9994245103752457e-05,
410
+ "loss": 0.5238,
411
+ "step": 540
412
+ },
413
+ {
414
+ "epoch": 12.222222222222221,
415
+ "grad_norm": 1.2466766834259033,
416
+ "learning_rate": 2.999398056268479e-05,
417
+ "loss": 0.536,
418
+ "step": 550
419
+ },
420
+ {
421
+ "epoch": 12.444444444444445,
422
+ "grad_norm": 1.4720205068588257,
423
+ "learning_rate": 2.9993710078482286e-05,
424
+ "loss": 0.4151,
425
+ "step": 560
426
+ },
427
+ {
428
+ "epoch": 12.666666666666666,
429
+ "grad_norm": 3.3889882564544678,
430
+ "learning_rate": 2.9993433651252164e-05,
431
+ "loss": 0.6201,
432
+ "step": 570
433
+ },
434
+ {
435
+ "epoch": 12.88888888888889,
436
+ "grad_norm": 1.472764253616333,
437
+ "learning_rate": 2.9993151281103986e-05,
438
+ "loss": 0.5345,
439
+ "step": 580
440
+ },
441
+ {
442
+ "epoch": 13.11111111111111,
443
+ "grad_norm": 2.440230369567871,
444
+ "learning_rate": 2.9992862968149675e-05,
445
+ "loss": 0.4178,
446
+ "step": 590
447
+ },
448
+ {
449
+ "epoch": 13.333333333333334,
450
+ "grad_norm": 2.4395759105682373,
451
+ "learning_rate": 2.9992568712503513e-05,
452
+ "loss": 0.5315,
453
+ "step": 600
454
+ },
455
+ {
456
+ "epoch": 13.333333333333334,
457
+ "eval_loss": 0.7217289209365845,
458
+ "eval_runtime": 0.4076,
459
+ "eval_samples_per_second": 24.535,
460
+ "eval_steps_per_second": 24.535,
461
+ "step": 600
462
  }
463
  ],
464
  "logging_steps": 10,
 
478
  "attributes": {}
479
  }
480
  },
481
+ "total_flos": 6372295013597184.0,
482
  "train_batch_size": 2,
483
  "trial_name": null,
484
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23ba7417f756bb3cb13785760bc75de6ff69ccfe22374af3547ce2aa84536e79
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44d2192bde2a23766c50facad1c20f2470e5e208bb2f21a9c48d77c7aea22798
3
  size 5496