irishprancer committed on
Commit
636ecc9
·
verified ·
1 Parent(s): 45db79f

Training in progress, step 450, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0efe3b1a63bdf5716dca804ea64c8e52657064ebb6dc717f4792370cb487673
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:372891febfadfccbabac9570878fa86511c85965c83b7adbeef55c8d100f4f2d
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3bb888b4e54da7ea7f8f7484598145948a007a101d68fd0d5166d01828fe0578
3
  size 1054136250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:620d4f511180616a4534055c09585878071e140c18fdd5dc3beb5a71366c356b
3
  size 1054136250
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:226f394c3a9826cc7f74d0799aa02f643f1ee6b891784f44c588787dbc9c0cb3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cf065c84ff75b4c8bc24f08fcd1880a75e81b5b99444434709d4c17d68aad0f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2673d78ac7304a2a7678ae71ed65422fa2295f07aca63cf23ca76e0b5c92da69
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb248e7cc2fe7b509c9e866be7b72af3b33225d8b86373c1a62393cc3a24f4da
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7685600519180298,
3
- "best_model_checkpoint": "./output/checkpoint-150",
4
- "epoch": 3.3333333333333335,
5
  "eval_steps": 150,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -120,6 +120,288 @@
120
  "eval_samples_per_second": 22.169,
121
  "eval_steps_per_second": 22.169,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
@@ -139,7 +421,7 @@
139
  "attributes": {}
140
  }
141
  },
142
- "total_flos": 1615667282657280.0,
143
  "train_batch_size": 2,
144
  "trial_name": null,
145
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7259252071380615,
3
+ "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 10.0,
5
  "eval_steps": 150,
6
+ "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
120
  "eval_samples_per_second": 22.169,
121
  "eval_steps_per_second": 22.169,
122
  "step": 150
123
+ },
124
+ {
125
+ "epoch": 3.5555555555555554,
126
+ "grad_norm": 2.560298204421997,
127
+ "learning_rate": 2.9999892980750276e-05,
128
+ "loss": 0.6943,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 3.7777777777777777,
133
+ "grad_norm": 2.3921022415161133,
134
+ "learning_rate": 2.9999854334972655e-05,
135
+ "loss": 0.6927,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 4.0,
140
+ "grad_norm": 2.541400909423828,
141
+ "learning_rate": 2.999980974373202e-05,
142
+ "loss": 0.6681,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 4.222222222222222,
147
+ "grad_norm": 1.6048011779785156,
148
+ "learning_rate": 2.9999759207046055e-05,
149
+ "loss": 0.59,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 4.444444444444445,
154
+ "grad_norm": 1.6808319091796875,
155
+ "learning_rate": 2.9999702724934783e-05,
156
+ "loss": 0.7109,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 4.666666666666667,
161
+ "grad_norm": 3.4204533100128174,
162
+ "learning_rate": 2.99996402974206e-05,
163
+ "loss": 0.6277,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 4.888888888888889,
168
+ "grad_norm": 2.078854560852051,
169
+ "learning_rate": 2.9999571924528243e-05,
170
+ "loss": 0.6732,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 5.111111111111111,
175
+ "grad_norm": 2.6296238899230957,
176
+ "learning_rate": 2.9999497606284816e-05,
177
+ "loss": 0.6029,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 5.333333333333333,
182
+ "grad_norm": 1.4844911098480225,
183
+ "learning_rate": 2.9999417342719775e-05,
184
+ "loss": 0.6941,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 5.555555555555555,
189
+ "grad_norm": 2.2181289196014404,
190
+ "learning_rate": 2.9999331133864935e-05,
191
+ "loss": 0.6476,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 5.777777777777778,
196
+ "grad_norm": 1.705496907234192,
197
+ "learning_rate": 2.9999238979754465e-05,
198
+ "loss": 0.6095,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 6.0,
203
+ "grad_norm": 1.9006123542785645,
204
+ "learning_rate": 2.99991408804249e-05,
205
+ "loss": 0.5755,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 6.222222222222222,
210
+ "grad_norm": 2.1468276977539062,
211
+ "learning_rate": 2.999903683591511e-05,
212
+ "loss": 0.5742,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 6.444444444444445,
217
+ "grad_norm": 1.3946986198425293,
218
+ "learning_rate": 2.9998926846266345e-05,
219
+ "loss": 0.6137,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 6.666666666666667,
224
+ "grad_norm": 2.292116641998291,
225
+ "learning_rate": 2.9998810911522193e-05,
226
+ "loss": 0.6226,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 6.666666666666667,
231
+ "eval_loss": 0.7391407489776611,
232
+ "eval_runtime": 0.5623,
233
+ "eval_samples_per_second": 17.783,
234
+ "eval_steps_per_second": 17.783,
235
+ "step": 300
236
+ },
237
+ {
238
+ "Start_State_loss": 0.7391407489776611,
239
+ "Start_State_runtime": 0.5004,
240
+ "Start_State_samples_per_second": 19.984,
241
+ "Start_State_steps_per_second": 19.984,
242
+ "epoch": 6.666666666666667,
243
+ "step": 300
244
+ },
245
+ {
246
+ "SWA_loss": 0.7391407489776611,
247
+ "SWA_runtime": 0.5139,
248
+ "SWA_samples_per_second": 19.46,
249
+ "SWA_steps_per_second": 19.46,
250
+ "epoch": 6.666666666666667,
251
+ "step": 300
252
+ },
253
+ {
254
+ "EMA_loss": 0.7391407489776611,
255
+ "EMA_runtime": 0.515,
256
+ "EMA_samples_per_second": 19.417,
257
+ "EMA_steps_per_second": 19.417,
258
+ "epoch": 6.666666666666667,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 6.888888888888889,
263
+ "grad_norm": 2.7107701301574707,
264
+ "learning_rate": 2.9998689031728615e-05,
265
+ "loss": 0.6481,
266
+ "step": 310
267
+ },
268
+ {
269
+ "epoch": 7.111111111111111,
270
+ "grad_norm": 1.764064908027649,
271
+ "learning_rate": 2.9998561206933918e-05,
272
+ "loss": 0.5866,
273
+ "step": 320
274
+ },
275
+ {
276
+ "epoch": 7.333333333333333,
277
+ "grad_norm": 1.7632637023925781,
278
+ "learning_rate": 2.9998427437188766e-05,
279
+ "loss": 0.5798,
280
+ "step": 330
281
+ },
282
+ {
283
+ "epoch": 7.555555555555555,
284
+ "grad_norm": 2.3483335971832275,
285
+ "learning_rate": 2.999828772254618e-05,
286
+ "loss": 0.6034,
287
+ "step": 340
288
+ },
289
+ {
290
+ "epoch": 7.777777777777778,
291
+ "grad_norm": 2.47190260887146,
292
+ "learning_rate": 2.9998142063061544e-05,
293
+ "loss": 0.662,
294
+ "step": 350
295
+ },
296
+ {
297
+ "epoch": 8.0,
298
+ "grad_norm": 1.323142409324646,
299
+ "learning_rate": 2.9997990458792583e-05,
300
+ "loss": 0.6039,
301
+ "step": 360
302
+ },
303
+ {
304
+ "epoch": 8.222222222222221,
305
+ "grad_norm": 1.9020463228225708,
306
+ "learning_rate": 2.9997832909799397e-05,
307
+ "loss": 0.5489,
308
+ "step": 370
309
+ },
310
+ {
311
+ "epoch": 8.444444444444445,
312
+ "grad_norm": 1.9343500137329102,
313
+ "learning_rate": 2.9997669416144432e-05,
314
+ "loss": 0.641,
315
+ "step": 380
316
+ },
317
+ {
318
+ "epoch": 8.666666666666666,
319
+ "grad_norm": 1.0505070686340332,
320
+ "learning_rate": 2.999749997789249e-05,
321
+ "loss": 0.5396,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 8.88888888888889,
326
+ "grad_norm": 1.5202258825302124,
327
+ "learning_rate": 2.9997324595110723e-05,
328
+ "loss": 0.6543,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 9.11111111111111,
333
+ "grad_norm": 1.384507656097412,
334
+ "learning_rate": 2.9997143267868663e-05,
335
+ "loss": 0.5946,
336
+ "step": 410
337
+ },
338
+ {
339
+ "epoch": 9.333333333333334,
340
+ "grad_norm": 2.468230962753296,
341
+ "learning_rate": 2.999695599623817e-05,
342
+ "loss": 0.6231,
343
+ "step": 420
344
+ },
345
+ {
346
+ "epoch": 9.555555555555555,
347
+ "grad_norm": 2.60021710395813,
348
+ "learning_rate": 2.9996762780293483e-05,
349
+ "loss": 0.575,
350
+ "step": 430
351
+ },
352
+ {
353
+ "epoch": 9.777777777777779,
354
+ "grad_norm": 1.5323718786239624,
355
+ "learning_rate": 2.9996563620111176e-05,
356
+ "loss": 0.529,
357
+ "step": 440
358
+ },
359
+ {
360
+ "epoch": 10.0,
361
+ "grad_norm": 1.3856033086776733,
362
+ "learning_rate": 2.9996358515770198e-05,
363
+ "loss": 0.5417,
364
+ "step": 450
365
+ },
366
+ {
367
+ "epoch": 10.0,
368
+ "eval_loss": 0.7259252071380615,
369
+ "eval_runtime": 0.4435,
370
+ "eval_samples_per_second": 22.546,
371
+ "eval_steps_per_second": 22.546,
372
+ "step": 450
373
+ },
374
+ {
375
+ "Start_State_loss": 0.7391407489776611,
376
+ "Start_State_runtime": 0.4445,
377
+ "Start_State_samples_per_second": 22.495,
378
+ "Start_State_steps_per_second": 22.495,
379
+ "epoch": 10.0,
380
+ "step": 450
381
+ },
382
+ {
383
+ "Raw_Model_loss": 0.7259252071380615,
384
+ "Raw_Model_runtime": 0.4437,
385
+ "Raw_Model_samples_per_second": 22.539,
386
+ "Raw_Model_steps_per_second": 22.539,
387
+ "epoch": 10.0,
388
+ "step": 450
389
+ },
390
+ {
391
+ "SWA_loss": 0.733832836151123,
392
+ "SWA_runtime": 0.4445,
393
+ "SWA_samples_per_second": 22.496,
394
+ "SWA_steps_per_second": 22.496,
395
+ "epoch": 10.0,
396
+ "step": 450
397
+ },
398
+ {
399
+ "EMA_loss": 0.738980233669281,
400
+ "EMA_runtime": 0.4444,
401
+ "EMA_samples_per_second": 22.503,
402
+ "EMA_steps_per_second": 22.503,
403
+ "epoch": 10.0,
404
+ "step": 450
405
  }
406
  ],
407
  "logging_steps": 10,
 
421
  "attributes": {}
422
  }
423
  },
424
+ "total_flos": 4801636770840576.0,
425
  "train_batch_size": 2,
426
  "trial_name": null,
427
  "trial_params": null