ljcamargo commited on
Commit
69cf09b
·
verified ·
1 Parent(s): ccb5d92

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8123be51ad98480825d2aa361e56c755cf0dc6e020205917ceed574c4ae39f89
3
  size 3809184360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e062c064398a956fac974a79f09e1c9659956a9fdf96df5c70aa72db86396863
3
  size 3809184360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea95dc19f159c03247df4100c2ddfb737ed3834a1ab387bcd4b1cd06eb816c19
3
- size 2457458917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45f3f458d13720e9a7d4cd7e4225dcd1cc7c188cc14bfe1f5cdf1c81c33315ba
3
+ size 2457459557
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e5323bfb1fda121bfdfa4891bfab6888dd68e61e60302d177fa061000384bd8
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81d4f4b1fec8227486261e0ca0332075e5277c747f156631e8baf30d09642001
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22a563685e8553d4bb36da2a1e276a977d883dcf7c13f91e157ed2e26e002108
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b05714b3f7fdb6eaa769e652ab97d810715e0b9a1f62855693cf5929568c9e83
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1,
6
  "eval_steps": 500,
7
- "global_step": 250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -183,6 +183,181 @@
183
  "learning_rate": 4.546370967741936e-05,
184
  "loss": 1.262,
185
  "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  }
187
  ],
188
  "logging_steps": 10,
@@ -202,7 +377,7 @@
202
  "attributes": {}
203
  }
204
  },
205
- "total_flos": 4520071282176000.0,
206
  "train_batch_size": 2,
207
  "trial_name": null,
208
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2,
6
  "eval_steps": 500,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
183
  "learning_rate": 4.546370967741936e-05,
184
  "loss": 1.262,
185
  "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.104,
189
+ "grad_norm": 33.3493537902832,
190
+ "learning_rate": 4.526209677419355e-05,
191
+ "loss": 1.2206,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.108,
196
+ "grad_norm": 21.672395706176758,
197
+ "learning_rate": 4.506048387096775e-05,
198
+ "loss": 1.1687,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.112,
203
+ "grad_norm": 70.3603286743164,
204
+ "learning_rate": 4.485887096774194e-05,
205
+ "loss": 1.2836,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.116,
210
+ "grad_norm": 32.152740478515625,
211
+ "learning_rate": 4.465725806451613e-05,
212
+ "loss": 1.3799,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.12,
217
+ "grad_norm": 25.037168502807617,
218
+ "learning_rate": 4.4455645161290325e-05,
219
+ "loss": 1.5239,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.124,
224
+ "grad_norm": 19.55396842956543,
225
+ "learning_rate": 4.425403225806452e-05,
226
+ "loss": 1.1581,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.128,
231
+ "grad_norm": 21.596759796142578,
232
+ "learning_rate": 4.4052419354838714e-05,
233
+ "loss": 1.314,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.132,
238
+ "grad_norm": 15.211933135986328,
239
+ "learning_rate": 4.385080645161291e-05,
240
+ "loss": 1.1852,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.136,
245
+ "grad_norm": 24.59844970703125,
246
+ "learning_rate": 4.36491935483871e-05,
247
+ "loss": 1.1092,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.14,
252
+ "grad_norm": 23.929607391357422,
253
+ "learning_rate": 4.344758064516129e-05,
254
+ "loss": 0.98,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.144,
259
+ "grad_norm": 22.07075309753418,
260
+ "learning_rate": 4.3245967741935486e-05,
261
+ "loss": 1.2239,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.148,
266
+ "grad_norm": 18.487125396728516,
267
+ "learning_rate": 4.3044354838709674e-05,
268
+ "loss": 1.3897,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.152,
273
+ "grad_norm": 30.2105712890625,
274
+ "learning_rate": 4.284274193548387e-05,
275
+ "loss": 1.0571,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.156,
280
+ "grad_norm": 29.202199935913086,
281
+ "learning_rate": 4.2641129032258064e-05,
282
+ "loss": 1.1788,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.16,
287
+ "grad_norm": 19.897415161132812,
288
+ "learning_rate": 4.243951612903226e-05,
289
+ "loss": 0.9951,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.164,
294
+ "grad_norm": 14.207056999206543,
295
+ "learning_rate": 4.2237903225806454e-05,
296
+ "loss": 1.287,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.168,
301
+ "grad_norm": 20.537015914916992,
302
+ "learning_rate": 4.203629032258065e-05,
303
+ "loss": 1.1353,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.172,
308
+ "grad_norm": 18.372892379760742,
309
+ "learning_rate": 4.1834677419354836e-05,
310
+ "loss": 1.0423,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.176,
315
+ "grad_norm": 37.1649284362793,
316
+ "learning_rate": 4.163306451612903e-05,
317
+ "loss": 0.95,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.18,
322
+ "grad_norm": 16.673492431640625,
323
+ "learning_rate": 4.1431451612903226e-05,
324
+ "loss": 0.9634,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.184,
329
+ "grad_norm": 21.696840286254883,
330
+ "learning_rate": 4.122983870967742e-05,
331
+ "loss": 1.2075,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.188,
336
+ "grad_norm": 18.72450065612793,
337
+ "learning_rate": 4.1028225806451615e-05,
338
+ "loss": 1.0212,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.192,
343
+ "grad_norm": 25.213973999023438,
344
+ "learning_rate": 4.082661290322581e-05,
345
+ "loss": 0.9902,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.196,
350
+ "grad_norm": 22.240453720092773,
351
+ "learning_rate": 4.0625000000000005e-05,
352
+ "loss": 0.9929,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.2,
357
+ "grad_norm": 18.012147903442383,
358
+ "learning_rate": 4.042338709677419e-05,
359
+ "loss": 0.9574,
360
+ "step": 500
361
  }
362
  ],
363
  "logging_steps": 10,
 
377
  "attributes": {}
378
  }
379
  },
380
+ "total_flos": 9036323224934400.0,
381
  "train_batch_size": 2,
382
  "trial_name": null,
383
  "trial_params": null